Bug Summary

File: llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 16635, column 31
Division by zero
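
The division-by-zero check (core.DivideZero, enabled via -analyzer-checker=core above) fires when the analyzer finds a path on which a computed divisor can be zero at the point of a division. The code at line 16635 is not part of this excerpt, so the following is only a minimal, hypothetical C++ sketch of the reported bug class; every name in it is invented for illustration and it is not the code the analyzer actually flagged:

// Hypothetical illustration of the core.DivideZero pattern; NOT the code at
// X86ISelLowering.cpp:16635, only a sketch of the bug class being reported.
static unsigned eltsPerLane(unsigned NumElts, unsigned LaneSizeInBits,
                            unsigned EltSizeInBits) {
  // NumLanes evaluates to 0 whenever LaneSizeInBits > NumElts * EltSizeInBits,
  // which is the kind of path the analyzer explores before the division.
  unsigned NumLanes = (NumElts * EltSizeInBits) / LaneSizeInBits;
  return NumElts / NumLanes; // analyzer: "Division by zero" on the 0-lane path
}

int main() {
  // Well-defined at runtime: 8 x 32-bit elements in 128-bit lanes -> 2 lanes.
  // The analyzer additionally considers inputs where NumLanes becomes 0.
  return static_cast<int>(eltsPerLane(8, 128, 32));
}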

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Target/X86 -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/include -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-08-28-193554-24367-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
108 const X86Subtarget &STI)
109 : TargetLowering(TM), Subtarget(STI) {
110 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
111 X86ScalarSSEf64 = Subtarget.hasSSE2();
112 X86ScalarSSEf32 = Subtarget.hasSSE1();
113 X86ScalarSSEf16 = Subtarget.hasFP16();
114 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
115
116 // Set up the TargetLowering object.
117
118 // X86 is weird. It always uses i8 for shift amounts and setcc results.
119 setBooleanContents(ZeroOrOneBooleanContent);
120 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
121 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
122
123 // For 64-bit, since we have so many registers, use the ILP scheduler.
124 // For 32-bit, use the register pressure specific scheduling.
125 // For Atom, always use ILP scheduling.
126 if (Subtarget.isAtom())
127 setSchedulingPreference(Sched::ILP);
128 else if (Subtarget.is64Bit())
129 setSchedulingPreference(Sched::ILP);
130 else
131 setSchedulingPreference(Sched::RegPressure);
132 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
133 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
134
135 // Bypass expensive divides and use cheaper ones.
136 if (TM.getOptLevel() >= CodeGenOpt::Default) {
137 if (Subtarget.hasSlowDivide32())
138 addBypassSlowDiv(32, 8);
139 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
140 addBypassSlowDiv(64, 32);
141 }
142
143 // Setup Windows compiler runtime calls.
144 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
145 static const struct {
146 const RTLIB::Libcall Op;
147 const char * const Name;
148 const CallingConv::ID CC;
149 } LibraryCalls[] = {
150 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
151 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
152 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
153 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
154 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
155 };
156
157 for (const auto &LC : LibraryCalls) {
158 setLibcallName(LC.Op, LC.Name);
159 setLibcallCallingConv(LC.Op, LC.CC);
160 }
161 }
162
163 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
164 // MSVCRT doesn't have powi; fall back to pow
165 setLibcallName(RTLIB::POWI_F32, nullptr);
166 setLibcallName(RTLIB::POWI_F64, nullptr);
167 }
168
169 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
170 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
171 // FIXME: Should we be limiting the atomic size on other configs? Default is
172 // 1024.
173 if (!Subtarget.hasCmpxchg8b())
174 setMaxAtomicSizeInBitsSupported(32);
175
176 // Set up the register classes.
177 addRegisterClass(MVT::i8, &X86::GR8RegClass);
178 addRegisterClass(MVT::i16, &X86::GR16RegClass);
179 addRegisterClass(MVT::i32, &X86::GR32RegClass);
180 if (Subtarget.is64Bit())
181 addRegisterClass(MVT::i64, &X86::GR64RegClass);
182
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
185
186 // We don't accept any truncstore of integer registers.
187 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
188 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
189 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
190 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
191 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
192 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
193
194 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
195
196 // SETOEQ and SETUNE require checking two conditions.
197 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
198 setCondCodeAction(ISD::SETOEQ, VT, Expand);
199 setCondCodeAction(ISD::SETUNE, VT, Expand);
200 }
201
202 // Integer absolute.
203 if (Subtarget.hasCMov()) {
204 setOperationAction(ISD::ABS , MVT::i16 , Custom);
205 setOperationAction(ISD::ABS , MVT::i32 , Custom);
206 if (Subtarget.is64Bit())
207 setOperationAction(ISD::ABS , MVT::i64 , Custom);
208 }
209
210 // Signed saturation subtraction.
211 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
212 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
213 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
214 if (Subtarget.is64Bit())
215 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
216
217 // Funnel shifts.
218 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
219 // For slow shld targets we only lower for code size.
220 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
221
222 setOperationAction(ShiftOp , MVT::i8 , Custom);
223 setOperationAction(ShiftOp , MVT::i16 , Custom);
224 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
225 if (Subtarget.is64Bit())
226 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
227 }
228
229 if (!Subtarget.useSoftFloat()) {
230 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
231 // operation.
232 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
233 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
234 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
235 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
236 // We have an algorithm for SSE2, and we turn this into a 64-bit
237 // FILD or VCVTUSI2SS/SD for other targets.
238 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
239 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
240 // We have an algorithm for SSE2->double, and we turn this into a
241 // 64-bit FILD followed by conditional FADD for other targets.
242 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
243 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
244
245 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
246 // this operation.
247 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
248 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
249 // SSE has no i16 to fp conversion, only i32. We promote in the handler
250 // to allow f80 to use i16 and f64 to use i16 with sse1 only
251 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
252 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
253 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
254 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
255 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
256 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
257 // are Legal, f80 is custom lowered.
258 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
259 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
260
261 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
262 // this operation.
263 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
264 // FIXME: This doesn't generate invalid exception when it should. PR44019.
265 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
266 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
267 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
268 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
269 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
270 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
271 // are Legal, f80 is custom lowered.
272 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
273 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
274
275 // Handle FP_TO_UINT by promoting the destination to a larger signed
276 // conversion.
277 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
278 // FIXME: This doesn't generate invalid exception when it should. PR44019.
279 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
280 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
281 // FIXME: This doesn't generate invalid exception when it should. PR44019.
282 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
283 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
284 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
285 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
286 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
287
288 setOperationAction(ISD::LRINT, MVT::f32, Custom);
289 setOperationAction(ISD::LRINT, MVT::f64, Custom);
290 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
291 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
292
293 if (!Subtarget.is64Bit()) {
294 setOperationAction(ISD::LRINT, MVT::i64, Custom);
295 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
296 }
297 }
298
299 if (Subtarget.hasSSE2()) {
300 // Custom lowering for saturating float to int conversions.
301 // We handle promotion to larger result types manually.
302 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
303 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
304 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
305 }
306 if (Subtarget.is64Bit()) {
307 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
308 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
309 }
310 }
311
312 // Handle address space casts between mixed sized pointers.
313 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
314 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
315
316 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
317 if (!X86ScalarSSEf64) {
318 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
319 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
320 if (Subtarget.is64Bit()) {
321 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
322 // Without SSE, i64->f64 goes through memory.
323 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
324 }
325 } else if (!Subtarget.is64Bit())
326 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
327
328 // Scalar integer divide and remainder are lowered to use operations that
329 // produce two results, to match the available instructions. This exposes
330 // the two-result form to trivial CSE, which is able to combine x/y and x%y
331 // into a single instruction.
332 //
333 // Scalar integer multiply-high is also lowered to use two-result
334 // operations, to match the available instructions. However, plain multiply
335 // (low) operations are left as Legal, as there are single-result
336 // instructions for this in x86. Using the two-result multiply instructions
337 // when both high and low results are needed must be arranged by dagcombine.
338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
339 setOperationAction(ISD::MULHS, VT, Expand);
340 setOperationAction(ISD::MULHU, VT, Expand);
341 setOperationAction(ISD::SDIV, VT, Expand);
342 setOperationAction(ISD::UDIV, VT, Expand);
343 setOperationAction(ISD::SREM, VT, Expand);
344 setOperationAction(ISD::UREM, VT, Expand);
345 }
346
347 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
348 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
349 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
350 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
351 setOperationAction(ISD::BR_CC, VT, Expand);
352 setOperationAction(ISD::SELECT_CC, VT, Expand);
353 }
354 if (Subtarget.is64Bit())
355 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
356 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
357 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
358 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
359
360 setOperationAction(ISD::FREM , MVT::f32 , Expand);
361 setOperationAction(ISD::FREM , MVT::f64 , Expand);
362 setOperationAction(ISD::FREM , MVT::f80 , Expand);
363 setOperationAction(ISD::FREM , MVT::f128 , Expand);
364
365 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
366 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
367 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
368 }
369
370 // Promote the i8 variants and force them on up to i32 which has a shorter
371 // encoding.
372 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
373 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
374
375 if (Subtarget.hasBMI()) {
376 // Promote the i16 zero undef variant and force it on up to i32 when tzcnt
377 // is enabled.
378 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
379 } else {
380 setOperationAction(ISD::CTTZ, MVT::i16, Custom);
381 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
382 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
383 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
384 if (Subtarget.is64Bit()) {
385 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
386 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
387 }
388 }
389
390 if (Subtarget.hasLZCNT()) {
391 // When promoting the i8 variants, force them to i32 for a shorter
392 // encoding.
393 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
394 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
395 } else {
396 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
397 if (VT == MVT::i64 && !Subtarget.is64Bit())
398 continue;
399 setOperationAction(ISD::CTLZ , VT, Custom);
400 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
401 }
402 }
403
404 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
405 ISD::STRICT_FP_TO_FP16}) {
406 // Special handling for half-precision floating point conversions.
407 // If we don't have F16C support, then lower half float conversions
408 // into library calls.
409 setOperationAction(
410 Op, MVT::f32,
411 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
412 // There's never any support for operations beyond MVT::f32.
413 setOperationAction(Op, MVT::f64, Expand);
414 setOperationAction(Op, MVT::f80, Expand);
415 setOperationAction(Op, MVT::f128, Expand);
416 }
417
418 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
419 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
420 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
421 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
422 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
423 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
424 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
425 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
426
427 setOperationAction(ISD::PARITY, MVT::i8, Custom);
428 if (Subtarget.hasPOPCNT()) {
429 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
430 } else {
431 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
432 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
433 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
434 if (Subtarget.is64Bit())
435 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
436 else
437 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
438
439 setOperationAction(ISD::PARITY, MVT::i16, Custom);
440 setOperationAction(ISD::PARITY, MVT::i32, Custom);
441 if (Subtarget.is64Bit())
442 setOperationAction(ISD::PARITY, MVT::i64, Custom);
443 }
444
445 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
446
447 if (!Subtarget.hasMOVBE())
448 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
449
450 // X86 wants to expand cmov itself.
451 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
452 setOperationAction(ISD::SELECT, VT, Custom);
453 setOperationAction(ISD::SETCC, VT, Custom);
454 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
455 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
456 }
457 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
458 if (VT == MVT::i64 && !Subtarget.is64Bit())
459 continue;
460 setOperationAction(ISD::SELECT, VT, Custom);
461 setOperationAction(ISD::SETCC, VT, Custom);
462 }
463
464 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
465 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
466 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
467
468 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
469 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
470 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
471 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
472 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
473 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
474 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
475 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
476
477 // Darwin ABI issue.
478 for (auto VT : { MVT::i32, MVT::i64 }) {
479 if (VT == MVT::i64 && !Subtarget.is64Bit())
480 continue;
481 setOperationAction(ISD::ConstantPool , VT, Custom);
482 setOperationAction(ISD::JumpTable , VT, Custom);
483 setOperationAction(ISD::GlobalAddress , VT, Custom);
484 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
485 setOperationAction(ISD::ExternalSymbol , VT, Custom);
486 setOperationAction(ISD::BlockAddress , VT, Custom);
487 }
488
489 // 64-bit shl, sra, srl (iff 32-bit x86)
490 for (auto VT : { MVT::i32, MVT::i64 }) {
491 if (VT == MVT::i64 && !Subtarget.is64Bit())
492 continue;
493 setOperationAction(ISD::SHL_PARTS, VT, Custom);
494 setOperationAction(ISD::SRA_PARTS, VT, Custom);
495 setOperationAction(ISD::SRL_PARTS, VT, Custom);
496 }
497
498 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
499 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
500
501 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
502
503 // Expand certain atomics
504 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
505 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
506 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
507 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
508 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
509 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
510 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
511 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
512 }
513
514 if (!Subtarget.is64Bit())
515 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
516
517 if (Subtarget.hasCmpxchg16b()) {
518 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
519 }
520
521 // FIXME - use subtarget debug flags
522 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
523 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
524 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
525 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
526 }
527
528 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
529 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
530
531 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
532 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
533
534 setOperationAction(ISD::TRAP, MVT::Other, Legal);
535 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
536 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
537
538 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
539 setOperationAction(ISD::VASTART , MVT::Other, Custom);
540 setOperationAction(ISD::VAEND , MVT::Other, Expand);
541 bool Is64Bit = Subtarget.is64Bit();
542 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
543 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
544
545 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
546 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
547
548 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
549
550 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
551 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
552 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
553
554 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
555 // f32 and f64 use SSE.
556 // Set up the FP register classes.
557 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
558 : &X86::FR32RegClass);
559 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
560 : &X86::FR64RegClass);
561
562 // Disable f32->f64 extload as we can only generate this in one instruction
563 // under optsize. So it's easier to pattern match (fpext (load)) for that
564 // case instead of needing to emit 2 instructions for extload in the
565 // non-optsize case.
566 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
567
568 for (auto VT : { MVT::f32, MVT::f64 }) {
569 // Use ANDPD to simulate FABS.
570 setOperationAction(ISD::FABS, VT, Custom);
571
572 // Use XORP to simulate FNEG.
573 setOperationAction(ISD::FNEG, VT, Custom);
574
575 // Use ANDPD and ORPD to simulate FCOPYSIGN.
576 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
577
578 // These might be better off as horizontal vector ops.
579 setOperationAction(ISD::FADD, VT, Custom);
580 setOperationAction(ISD::FSUB, VT, Custom);
581
582 // We don't support sin/cos/fmod
583 setOperationAction(ISD::FSIN , VT, Expand);
584 setOperationAction(ISD::FCOS , VT, Expand);
585 setOperationAction(ISD::FSINCOS, VT, Expand);
586 }
587
588 // Lower this to MOVMSK plus an AND.
589 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
590 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
591
592 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
593 (UseX87 || Is64Bit)) {
594 // Use SSE for f32, x87 for f64.
595 // Set up the FP register classes.
596 addRegisterClass(MVT::f32, &X86::FR32RegClass);
597 if (UseX87)
598 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
599
600 // Use ANDPS to simulate FABS.
601 setOperationAction(ISD::FABS , MVT::f32, Custom);
602
603 // Use XORP to simulate FNEG.
604 setOperationAction(ISD::FNEG , MVT::f32, Custom);
605
606 if (UseX87)
607 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
608
609 // Use ANDPS and ORPS to simulate FCOPYSIGN.
610 if (UseX87)
611 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
612 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
613
614 // We don't support sin/cos/fmod
615 setOperationAction(ISD::FSIN , MVT::f32, Expand);
616 setOperationAction(ISD::FCOS , MVT::f32, Expand);
617 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
618
619 if (UseX87) {
620 // Always expand sin/cos functions even though x87 has an instruction.
621 setOperationAction(ISD::FSIN, MVT::f64, Expand);
622 setOperationAction(ISD::FCOS, MVT::f64, Expand);
623 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
624 }
625 } else if (UseX87) {
626 // f32 and f64 in x87.
627 // Set up the FP register classes.
628 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
629 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 setOperationAction(ISD::UNDEF, VT, Expand);
633 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
634
635 // Always expand sin/cos functions even though x87 has an instruction.
636 setOperationAction(ISD::FSIN , VT, Expand);
637 setOperationAction(ISD::FCOS , VT, Expand);
638 setOperationAction(ISD::FSINCOS, VT, Expand);
639 }
640 }
641
642 // Expand FP32 immediates into loads from the stack, save special cases.
643 if (isTypeLegal(MVT::f32)) {
644 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
645 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
646 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
647 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
648 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
649 } else // SSE immediates.
650 addLegalFPImmediate(APFloat(+0.0f)); // xorps
651 }
652 // Expand FP64 immediates into loads from the stack, save special cases.
653 if (isTypeLegal(MVT::f64)) {
654 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
655 addLegalFPImmediate(APFloat(+0.0)); // FLD0
656 addLegalFPImmediate(APFloat(+1.0)); // FLD1
657 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
658 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
659 } else // SSE immediates.
660 addLegalFPImmediate(APFloat(+0.0)); // xorpd
661 }
662 // Handle constrained floating-point operations of scalar.
663 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
664 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
665 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
666 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
667 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
668 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
669 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
670 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
671 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
672 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
673 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
674 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
675 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
676
677 // We don't support FMA.
678 setOperationAction(ISD::FMA, MVT::f64, Expand);
679 setOperationAction(ISD::FMA, MVT::f32, Expand);
680
681 // f80 always uses X87.
682 if (UseX87) {
683 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
684 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
685 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
686 {
687 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
688 addLegalFPImmediate(TmpFlt); // FLD0
689 TmpFlt.changeSign();
690 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
691
692 bool ignored;
693 APFloat TmpFlt2(+1.0);
694 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
695 &ignored);
696 addLegalFPImmediate(TmpFlt2); // FLD1
697 TmpFlt2.changeSign();
698 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
699 }
700
701 // Always expand sin/cos functions even though x87 has an instruction.
702 setOperationAction(ISD::FSIN , MVT::f80, Expand);
703 setOperationAction(ISD::FCOS , MVT::f80, Expand);
704 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
705
706 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
707 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
708 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
709 setOperationAction(ISD::FRINT, MVT::f80, Expand);
710 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
711 setOperationAction(ISD::FMA, MVT::f80, Expand);
712 setOperationAction(ISD::LROUND, MVT::f80, Expand);
713 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
714 setOperationAction(ISD::LRINT, MVT::f80, Custom);
715 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
716 setOperationAction(ISD::ISNAN, MVT::f80, Custom);
717
718 // Handle constrained floating-point operations of scalar.
719 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
720 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
721 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
722 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
723 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
724 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
725 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
726 // as Custom.
727 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
728 }
729
730 // f128 uses xmm registers, but most operations require libcalls.
731 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
732 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
733 : &X86::VR128RegClass);
734
735 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
736
737 setOperationAction(ISD::FADD, MVT::f128, LibCall);
738 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
739 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
740 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
741 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
742 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
743 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
744 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
745 setOperationAction(ISD::FMA, MVT::f128, LibCall);
746 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
747
748 setOperationAction(ISD::FABS, MVT::f128, Custom);
749 setOperationAction(ISD::FNEG, MVT::f128, Custom);
750 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
751
752 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
753 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
754 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
755 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
756 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
757 // No STRICT_FSINCOS
758 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
759 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
760
761 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
762 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
763 // We need to custom handle any FP_ROUND with an f128 input, but
764 // LegalizeDAG uses the result type to know when to run a custom handler.
765 // So we have to list all legal floating point result types here.
766 if (isTypeLegal(MVT::f32)) {
767 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
768 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
769 }
770 if (isTypeLegal(MVT::f64)) {
771 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
772 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
773 }
774 if (isTypeLegal(MVT::f80)) {
775 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
776 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
777 }
778
779 setOperationAction(ISD::SETCC, MVT::f128, Custom);
780
781 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
782 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
783 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
784 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
785 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
786 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
787 }
788
789 // Always use a library call for pow.
790 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
791 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
792 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
793 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
794
795 setOperationAction(ISD::FLOG, MVT::f80, Expand);
796 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
797 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
798 setOperationAction(ISD::FEXP, MVT::f80, Expand);
799 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
800 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
801 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
802
803 // Some FP actions are always expanded for vector types.
804 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
805 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
806 setOperationAction(ISD::FSIN, VT, Expand);
807 setOperationAction(ISD::FSINCOS, VT, Expand);
808 setOperationAction(ISD::FCOS, VT, Expand);
809 setOperationAction(ISD::FREM, VT, Expand);
810 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
811 setOperationAction(ISD::FPOW, VT, Expand);
812 setOperationAction(ISD::FLOG, VT, Expand);
813 setOperationAction(ISD::FLOG2, VT, Expand);
814 setOperationAction(ISD::FLOG10, VT, Expand);
815 setOperationAction(ISD::FEXP, VT, Expand);
816 setOperationAction(ISD::FEXP2, VT, Expand);
817 }
818
819 // First set operation action for all vector types to either promote
820 // (for widening) or expand (for scalarization). Then we will selectively
821 // turn on ones that can be effectively codegen'd.
822 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
823 setOperationAction(ISD::SDIV, VT, Expand);
824 setOperationAction(ISD::UDIV, VT, Expand);
825 setOperationAction(ISD::SREM, VT, Expand);
826 setOperationAction(ISD::UREM, VT, Expand);
827 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
828 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
829 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
830 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
831 setOperationAction(ISD::FMA, VT, Expand);
832 setOperationAction(ISD::FFLOOR, VT, Expand);
833 setOperationAction(ISD::FCEIL, VT, Expand);
834 setOperationAction(ISD::FTRUNC, VT, Expand);
835 setOperationAction(ISD::FRINT, VT, Expand);
836 setOperationAction(ISD::FNEARBYINT, VT, Expand);
837 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
838 setOperationAction(ISD::MULHS, VT, Expand);
839 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
840 setOperationAction(ISD::MULHU, VT, Expand);
841 setOperationAction(ISD::SDIVREM, VT, Expand);
842 setOperationAction(ISD::UDIVREM, VT, Expand);
843 setOperationAction(ISD::CTPOP, VT, Expand);
844 setOperationAction(ISD::CTTZ, VT, Expand);
845 setOperationAction(ISD::CTLZ, VT, Expand);
846 setOperationAction(ISD::ROTL, VT, Expand);
847 setOperationAction(ISD::ROTR, VT, Expand);
848 setOperationAction(ISD::BSWAP, VT, Expand);
849 setOperationAction(ISD::SETCC, VT, Expand);
850 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
851 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
852 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
853 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
854 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
855 setOperationAction(ISD::TRUNCATE, VT, Expand);
856 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
857 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
858 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
859 setOperationAction(ISD::SELECT_CC, VT, Expand);
860 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
861 setTruncStoreAction(InnerVT, VT, Expand);
862
863 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
864 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
865
866 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
867 // types, we have to deal with them whether we ask for Expansion or not.
868 // Setting Expand causes its own optimisation problems though, so leave
869 // them legal.
870 if (VT.getVectorElementType() == MVT::i1)
871 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
872
873 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
874 // split/scalarized right now.
875 if (VT.getVectorElementType() == MVT::f16)
876 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
877 }
878 }
879
880 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
881 // with -msoft-float, disable use of MMX as well.
882 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
883 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
884 // No operations on x86mmx supported, everything uses intrinsics.
885 }
886
887 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
888 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
889 : &X86::VR128RegClass);
890
891 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
892 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
893 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
894 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
895 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
896 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
897 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
898 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
899
900 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
901 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
902
903 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
904 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
905 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
906 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
907 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
908 }
909
910 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
911 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
912 : &X86::VR128RegClass);
913
914 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
915 // registers cannot be used even for integer operations.
916 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
917 : &X86::VR128RegClass);
918 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
919 : &X86::VR128RegClass);
920 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
921 : &X86::VR128RegClass);
922 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
923 : &X86::VR128RegClass);
924
925 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
926 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
927 setOperationAction(ISD::SDIV, VT, Custom);
928 setOperationAction(ISD::SREM, VT, Custom);
929 setOperationAction(ISD::UDIV, VT, Custom);
930 setOperationAction(ISD::UREM, VT, Custom);
931 }
932
933 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
934 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
935 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
936
937 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
938 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
939 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
940 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
941 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
942 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
943 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
944 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
945 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
946 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
947
948 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
949 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
950
951 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
952 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
953 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
954
955 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
956 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
957 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
958 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
959 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
960 }
961
962 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
963 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
964 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
965 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
966 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
967 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
968 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
969 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
970 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
971 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
972
973 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
974 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
975 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
976
977 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
978 setOperationAction(ISD::SETCC, VT, Custom);
979 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
980 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
981 setOperationAction(ISD::CTPOP, VT, Custom);
982 setOperationAction(ISD::ABS, VT, Custom);
983
984 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
985 // setcc all the way to isel and prefer SETGT in some isel patterns.
986 setCondCodeAction(ISD::SETLT, VT, Custom);
987 setCondCodeAction(ISD::SETLE, VT, Custom);
988 }
989
990 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
991 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
992 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
993 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
994 setOperationAction(ISD::VSELECT, VT, Custom);
995 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
996 }
997
998 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
999 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1000 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1001 setOperationAction(ISD::VSELECT, VT, Custom);
1002
1003 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1004 continue;
1005
1006 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1007 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1008 }
1009
1010 // Custom lower v2i64 and v2f64 selects.
1011 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1013 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1014 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1015 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1016
1017 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1018 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1019 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1020 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1021 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
1022 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1023
1024 // Custom legalize these to avoid over promotion or custom promotion.
1025 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1026 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1027 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1028 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1029 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1030 }
1031
1032 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1033 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1034 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1035 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1036
1037 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1038 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1039
1040 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1041 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1042
1043 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1044 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1045 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1046 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1047 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1048
1049 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1050 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1051 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1052 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1053
1054 // We want to legalize this to an f64 load rather than an i64 load on
1055 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1056 // store.
1057 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1058 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1059 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1060 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1061 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1062 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1063
1064 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1065 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1066 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1067 if (!Subtarget.hasAVX512())
1068 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1069
1070 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1071 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1072 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1073
1074 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1075
1076 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1077 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1078 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1079 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1080 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1081 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1082
1083 // In the customized shift lowering, the legal v4i32/v2i64 cases
1084 // in AVX2 will be recognized.
1085 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1086 setOperationAction(ISD::SRL, VT, Custom);
1087 setOperationAction(ISD::SHL, VT, Custom);
1088 setOperationAction(ISD::SRA, VT, Custom);
1089 }
1090
1091 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1092 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1093
1094 // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1095 // shifts) is better.
1096 if (!Subtarget.useAVX512Regs() &&
1097 !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1098 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1099
1100 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1101 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1102 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1103 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1104 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1105 }
1106
1107 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1108 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1109 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1110 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1111 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1112 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1113 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1114 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1115 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1116
1117 // These might be better off as horizontal vector ops.
1118 setOperationAction(ISD::ADD, MVT::i16, Custom);
1119 setOperationAction(ISD::ADD, MVT::i32, Custom);
1120 setOperationAction(ISD::SUB, MVT::i16, Custom);
1121 setOperationAction(ISD::SUB, MVT::i32, Custom);
1122 }
1123
1124 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1125 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1126 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1127 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1128 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1129 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1130 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1131 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1132 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1133 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1134 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1135 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1136 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1137 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1138
1139 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1140 }
1141
1142 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1143 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1144 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1145 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1146 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1147 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1148 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1149 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1150
1151 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1152 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1153 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1154
1155 // FIXME: Do we need to handle scalar-to-vector here?
1156 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1157
1158 // We directly match byte blends in the backend as they match the VSELECT
1159 // condition form.
1160 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1161
1162 // SSE41 brings specific instructions for doing vector sign extend even in
1163 // cases where we don't have SRA.
1164 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1165 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1166 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1167 }
1168
1169 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1170 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1171 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1172 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1173 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1174 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1175 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1176 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1177 }
1178
1179 // i8 vectors are custom because the source register and source
1180 // memory operand types are not the same width.
1181 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1182
1183 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1184 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1185 // do the pre and post work in the vector domain.
1186 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1187 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1188 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1189 // so that DAG combine doesn't try to turn it into uint_to_fp.
1190 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1191 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1192 }
1193 }
1194
1195 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1196 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1197 }
1198
1199 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1200 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1201 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1202 setOperationAction(ISD::ROTL, VT, Custom);
1203
1204 // XOP can efficiently perform BITREVERSE with VPPERM.
1205 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1206 setOperationAction(ISD::BITREVERSE, VT, Custom);
1207
1208 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1209 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1210 setOperationAction(ISD::BITREVERSE, VT, Custom);
1211 }
1212
1213 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1214 bool HasInt256 = Subtarget.hasInt256();
1215
1216 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1217 : &X86::VR256RegClass);
1218 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1219 : &X86::VR256RegClass);
1220 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1221 : &X86::VR256RegClass);
1222 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1223 : &X86::VR256RegClass);
1224 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1225 : &X86::VR256RegClass);
1226 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1227 : &X86::VR256RegClass);
1228
1229 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1230 setOperationAction(ISD::FFLOOR, VT, Legal);
1231 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1232 setOperationAction(ISD::FCEIL, VT, Legal);
1233 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1234 setOperationAction(ISD::FTRUNC, VT, Legal);
1235 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1236 setOperationAction(ISD::FRINT, VT, Legal);
1237 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1238 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1239 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1240 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1241 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1242
1243 setOperationAction(ISD::FROUND, VT, Custom);
1244
1245 setOperationAction(ISD::FNEG, VT, Custom);
1246 setOperationAction(ISD::FABS, VT, Custom);
1247 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1248 }
1249
1250 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1251 // even though v8i16 is a legal type.
1252 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1253 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1254 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1255 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1256 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1257 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1258 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1259
1260 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1261 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1262
1263 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1264 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1265 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1266 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1267 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1268 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1269 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1270 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1271 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1272 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1273 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1274 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1275
1276 if (!Subtarget.hasAVX512())
1277 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1278
1279 // In the customized shift lowering, the legal v8i32/v4i64 cases
1280 // in AVX2 will be recognized.
1281 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1282 setOperationAction(ISD::SRL, VT, Custom);
1283 setOperationAction(ISD::SHL, VT, Custom);
1284 setOperationAction(ISD::SRA, VT, Custom);
1285 }
1286
1287 // These types need custom splitting if their input is a 128-bit vector.
1288 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1289 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1290 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1291 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1292
1293 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1294 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1295
1296    // With BWI, expanding (and promoting the shifts) is better.
1297 if (!Subtarget.useBWIRegs())
1298 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1299
1300 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1301 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1302 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1303 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1304 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1305 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1306
1307 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1308 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1309 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1310 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1311 }
1312
1313 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1314 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1315 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1316 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1317
1318 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1319 setOperationAction(ISD::SETCC, VT, Custom);
1320 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1321 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1322 setOperationAction(ISD::CTPOP, VT, Custom);
1323 setOperationAction(ISD::CTLZ, VT, Custom);
1324
1325      // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1326 // setcc all the way to isel and prefer SETGT in some isel patterns.
1327 setCondCodeAction(ISD::SETLT, VT, Custom);
1328 setCondCodeAction(ISD::SETLE, VT, Custom);
1329 }
1330
1331 if (Subtarget.hasAnyFMA()) {
1332 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1333 MVT::v2f64, MVT::v4f64 }) {
1334 setOperationAction(ISD::FMA, VT, Legal);
1335 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1336 }
1337 }
1338
1339 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1340 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1341 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1342 }
1343
1344 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1345 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1346 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1347 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1348
1349 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1350 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1351 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1352 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1353 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1354 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1355
1356 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1357 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1358
1359 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1360 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1361 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1362 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1363 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1364
1365 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1366 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1367 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1368 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1369 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1370 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1371 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1372 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1373 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1374 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1375 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1376 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1377
1378 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1379 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1380 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1381 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1382 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1383 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1384 }
1385
1386 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1387 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1388 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1389 }
1390
1391 if (HasInt256) {
1392 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1393      // when we have a 256-bit-wide blend with immediate.
1394 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1395 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1396
1397 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1398 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1399 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1405 }
1406 }
1407
1408 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1409 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1410 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1411 setOperationAction(ISD::MSTORE, VT, Legal);
1412 }
1413
1414 // Extract subvector is special because the value type
1415 // (result) is 128-bit but the source is 256-bit wide.
1416 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1417 MVT::v4f32, MVT::v2f64 }) {
1418 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1419 }
1420
1421 // Custom lower several nodes for 256-bit types.
1422 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1423 MVT::v8f32, MVT::v4f64 }) {
1424 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1425 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1426 setOperationAction(ISD::VSELECT, VT, Custom);
1427 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1428 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1429 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1430 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1431 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1432 setOperationAction(ISD::STORE, VT, Custom);
1433 }
1434
1435 if (HasInt256) {
1436 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1437
1438 // Custom legalize 2x32 to get a little better code.
1439 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1440 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1441
1442 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1443 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1444 setOperationAction(ISD::MGATHER, VT, Custom);
1445 }
1446 }
1447
1448 // This block controls legalization of the mask vector sizes that are
1449 // available with AVX512. 512-bit vectors are in a separate block controlled
1450 // by useAVX512Regs.
1451 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1452 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1453 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1454 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1455 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1456 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1457
1458 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1459 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1460 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1461
1462 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1463 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1464 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1465 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1466 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1467 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1468 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1469 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1470 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1471 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1472 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1473 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1474
1475    // There is no byte-sized k-register load or store without AVX512DQ.
1476 if (!Subtarget.hasDQI()) {
1477 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1478 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1479 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1480 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1481
1482 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1483 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1484 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1485 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1486 }
1487
1488 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1489 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1490 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1491 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1492 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1493 }
1494
1495 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1496 setOperationAction(ISD::VSELECT, VT, Expand);
1497
1498 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1499 setOperationAction(ISD::SETCC, VT, Custom);
1500 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1501 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1502 setOperationAction(ISD::SELECT, VT, Custom);
1503 setOperationAction(ISD::TRUNCATE, VT, Custom);
1504
1505 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1506 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1507 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1508 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1509 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1510 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1511 }
1512
1513 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1514 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1515 }
1516
1517  // This block controls legalization for 512-bit operations with 32/64-bit
1518  // elements. 512-bit operations can be disabled based on the
1519  // prefer-vector-width and required-vector-width function attributes.
1520 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1521 bool HasBWI = Subtarget.hasBWI();
1522
1523 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1524 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1525 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1526 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1527 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1528 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1529
1530 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1531 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1532 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1533 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1534 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1535 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1536 if (HasBWI)
1537 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1538 }
1539
1540 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1541 setOperationAction(ISD::FNEG, VT, Custom);
1542 setOperationAction(ISD::FABS, VT, Custom);
1543 setOperationAction(ISD::FMA, VT, Legal);
1544 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1545 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1546 }
1547
1548 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1549 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1550 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1551 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1552 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1553 }
1554 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1555 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1556 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1557 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1558 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1559 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1560 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1561 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1562
1563 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1564 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1565 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1566 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1567 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1568 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1569 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1570 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1571 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1572 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1573 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1574 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1575
1576 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1577 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1578 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1579 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1580 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1581 if (HasBWI)
1582 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1583
1584 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1585 // to 512-bit rather than use the AVX2 instructions so that we can use
1586 // k-masks.
1587 if (!Subtarget.hasVLX()) {
1588 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1589 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1590 setOperationAction(ISD::MLOAD, VT, Custom);
1591 setOperationAction(ISD::MSTORE, VT, Custom);
1592 }
1593 }
1594
1595 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1596 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1597 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1598 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1599 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1600 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1601 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1602 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1603 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1604 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1605 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1606 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1607 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1608
1609 if (HasBWI) {
1610 // Extends from v64i1 masks to 512-bit vectors.
1611 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1612 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1613 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1614 }
1615
1616 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1617 setOperationAction(ISD::FFLOOR, VT, Legal);
1618 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1619 setOperationAction(ISD::FCEIL, VT, Legal);
1620 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1621 setOperationAction(ISD::FTRUNC, VT, Legal);
1622 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1623 setOperationAction(ISD::FRINT, VT, Legal);
1624 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1625 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1626 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1627 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1628 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1629
1630 setOperationAction(ISD::FROUND, VT, Custom);
1631 }
1632
1633 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1634 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1635 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1636 }
1637
1638 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1639 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1640 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1641 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1642
1643 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1644 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1645 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1646 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1647
1648 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1649 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1650 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1651 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1652 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1653 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1654
1655 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1656 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1657
1658 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1659
1660 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1661 setOperationAction(ISD::SRL, VT, Custom);
1662 setOperationAction(ISD::SHL, VT, Custom);
1663 setOperationAction(ISD::SRA, VT, Custom);
1664 setOperationAction(ISD::SETCC, VT, Custom);
1665
1666      // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1667 // setcc all the way to isel and prefer SETGT in some isel patterns.
1668 setCondCodeAction(ISD::SETLT, VT, Custom);
1669 setCondCodeAction(ISD::SETLE, VT, Custom);
1670 }
1671 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1672 setOperationAction(ISD::SMAX, VT, Legal);
1673 setOperationAction(ISD::UMAX, VT, Legal);
1674 setOperationAction(ISD::SMIN, VT, Legal);
1675 setOperationAction(ISD::UMIN, VT, Legal);
1676 setOperationAction(ISD::ABS, VT, Legal);
1677 setOperationAction(ISD::CTPOP, VT, Custom);
1678 setOperationAction(ISD::ROTL, VT, Custom);
1679 setOperationAction(ISD::ROTR, VT, Custom);
1680 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1681 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1682 }
1683
1684 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1685 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1686 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1687 setOperationAction(ISD::CTLZ, VT, Custom);
1688 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1689 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1690 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1691 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1692 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1693 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1694 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1695 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1696 }
1697
1698 if (Subtarget.hasDQI()) {
1699 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1700 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1701 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1702 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1703 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1704 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1705 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1706 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1707
1708 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1709 }
1710
1711 if (Subtarget.hasCDI()) {
1712      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1713 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1714 setOperationAction(ISD::CTLZ, VT, Legal);
1715 }
1716 } // Subtarget.hasCDI()
1717
1718 if (Subtarget.hasVPOPCNTDQ()) {
1719 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1720 setOperationAction(ISD::CTPOP, VT, Legal);
1721 }
1722
1723 // Extract subvector is special because the value type
1724 // (result) is 256-bit but the source is 512-bit wide.
1725 // 128-bit was made Legal under AVX1.
1726 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1727 MVT::v8f32, MVT::v4f64 })
1728 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1729
1730 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1731 MVT::v16f32, MVT::v8f64 }) {
1732 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1733 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1734 setOperationAction(ISD::SELECT, VT, Custom);
1735 setOperationAction(ISD::VSELECT, VT, Custom);
1736 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1737 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1738 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1739 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1740 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1741 }
1742
1743 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1744 setOperationAction(ISD::MLOAD, VT, Legal);
1745 setOperationAction(ISD::MSTORE, VT, Legal);
1746 setOperationAction(ISD::MGATHER, VT, Custom);
1747 setOperationAction(ISD::MSCATTER, VT, Custom);
1748 }
1749 if (HasBWI) {
1750 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1751 setOperationAction(ISD::MLOAD, VT, Legal);
1752 setOperationAction(ISD::MSTORE, VT, Legal);
1753 }
1754 } else {
1755 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1756 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1757 }
1758
1759 if (Subtarget.hasVBMI2()) {
1760 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1761 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1762 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1763 setOperationAction(ISD::FSHL, VT, Custom);
1764 setOperationAction(ISD::FSHR, VT, Custom);
1765 }
1766
1767 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1768 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1769 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1770 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1771 }
1772 }// useAVX512Regs
1773
1774 // This block controls legalization for operations that don't have
1775 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1776 // narrower widths.
1777 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1778 // These operations are handled on non-VLX by artificially widening in
1779 // isel patterns.
1780
1781 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1782 Subtarget.hasVLX() ? Legal : Custom);
1783 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1784 Subtarget.hasVLX() ? Legal : Custom);
1785 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1786 Subtarget.hasVLX() ? Legal : Custom);
1787 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1788 Subtarget.hasVLX() ? Legal : Custom);
1789 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1790 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1791 Subtarget.hasVLX() ? Legal : Custom);
1792 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1793 Subtarget.hasVLX() ? Legal : Custom);
1794 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1795 Subtarget.hasVLX() ? Legal : Custom);
1796 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1797 Subtarget.hasVLX() ? Legal : Custom);
1798
1799 if (Subtarget.hasDQI()) {
1800 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1801 // v2f32 UINT_TO_FP is already custom under SSE2.
1802      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1803             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1804             "Unexpected operation action!");
1805 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1806 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1807 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1808 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1809 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1810 }
1811
1812 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1813 setOperationAction(ISD::SMAX, VT, Legal);
1814 setOperationAction(ISD::UMAX, VT, Legal);
1815 setOperationAction(ISD::SMIN, VT, Legal);
1816 setOperationAction(ISD::UMIN, VT, Legal);
1817 setOperationAction(ISD::ABS, VT, Legal);
1818 }
1819
1820 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1821 setOperationAction(ISD::ROTL, VT, Custom);
1822 setOperationAction(ISD::ROTR, VT, Custom);
1823 }
1824
1825 // Custom legalize 2x32 to get a little better code.
1826 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1827 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1828
1829 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1830 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1831 setOperationAction(ISD::MSCATTER, VT, Custom);
1832
1833 if (Subtarget.hasDQI()) {
1834 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1835 setOperationAction(ISD::SINT_TO_FP, VT,
1836 Subtarget.hasVLX() ? Legal : Custom);
1837 setOperationAction(ISD::UINT_TO_FP, VT,
1838 Subtarget.hasVLX() ? Legal : Custom);
1839 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1840 Subtarget.hasVLX() ? Legal : Custom);
1841 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1842 Subtarget.hasVLX() ? Legal : Custom);
1843 setOperationAction(ISD::FP_TO_SINT, VT,
1844 Subtarget.hasVLX() ? Legal : Custom);
1845 setOperationAction(ISD::FP_TO_UINT, VT,
1846 Subtarget.hasVLX() ? Legal : Custom);
1847 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1848 Subtarget.hasVLX() ? Legal : Custom);
1849 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1850 Subtarget.hasVLX() ? Legal : Custom);
1851 setOperationAction(ISD::MUL, VT, Legal);
1852 }
1853 }
1854
1855 if (Subtarget.hasCDI()) {
1856 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1857 setOperationAction(ISD::CTLZ, VT, Legal);
1858 }
1859 } // Subtarget.hasCDI()
1860
1861 if (Subtarget.hasVPOPCNTDQ()) {
1862 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1863 setOperationAction(ISD::CTPOP, VT, Legal);
1864 }
1865 }
1866
1867  // This block controls legalization of v32i1/v64i1, which are available with
1868 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1869 // useBWIRegs.
1870 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1871 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1872 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1873
1874 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1875 setOperationAction(ISD::VSELECT, VT, Expand);
1876 setOperationAction(ISD::TRUNCATE, VT, Custom);
1877 setOperationAction(ISD::SETCC, VT, Custom);
1878 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1879 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1880 setOperationAction(ISD::SELECT, VT, Custom);
1881 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1882 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1883 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1884 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1885 }
1886
1887 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1888 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1889
1890 // Extends from v32i1 masks to 256-bit vectors.
1891 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1892 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1893 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1894
1895 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1896 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1897 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1898 }
1899
1900 // These operations are handled on non-VLX by artificially widening in
1901 // isel patterns.
1902 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1903
1904 if (Subtarget.hasBITALG()) {
1905 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1906 setOperationAction(ISD::CTPOP, VT, Legal);
1907 }
1908 }
1909
1910 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
1911 auto setGroup = [&] (MVT VT) {
1912 setOperationAction(ISD::FADD, VT, Legal);
1913 setOperationAction(ISD::STRICT_FADD, VT, Legal);
1914 setOperationAction(ISD::FSUB, VT, Legal);
1915 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
1916 setOperationAction(ISD::FMUL, VT, Legal);
1917 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
1918 setOperationAction(ISD::FDIV, VT, Legal);
1919 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
1920 setOperationAction(ISD::FSQRT, VT, Legal);
1921 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
1922
1923 setOperationAction(ISD::FFLOOR, VT, Legal);
1924 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1925 setOperationAction(ISD::FCEIL, VT, Legal);
1926 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1927 setOperationAction(ISD::FTRUNC, VT, Legal);
1928 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1929 setOperationAction(ISD::FRINT, VT, Legal);
1930 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1931 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1932 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1933
1934 setOperationAction(ISD::LOAD, VT, Legal);
1935 setOperationAction(ISD::STORE, VT, Legal);
1936
1937 setOperationAction(ISD::FMA, VT, Legal);
1938 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1939 setOperationAction(ISD::VSELECT, VT, Legal);
1940 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1941 setOperationAction(ISD::SELECT, VT, Custom);
1942
1943 setOperationAction(ISD::FNEG, VT, Custom);
1944 setOperationAction(ISD::FABS, VT, Custom);
1945 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1946 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1947 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1948 };
1949
1950 // AVX512_FP16 scalar operations
1951 setGroup(MVT::f16);
1952 addRegisterClass(MVT::f16, &X86::FR16XRegClass);
1953 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
1954 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
1955 setOperationAction(ISD::SETCC, MVT::f16, Custom);
1956 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
1957 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
1958 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
1959 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
1960 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
1961 if (isTypeLegal(MVT::f80)) {
1962 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
1963 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
1964 }
1965
1966 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
1967 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
1968
1969 if (Subtarget.useAVX512Regs()) {
1970 setGroup(MVT::v32f16);
1971 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1972 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
1973 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
1974 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
1975 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
1976 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
1977 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
1978 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1979 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
1980
1981 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
1982 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
1983 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
1984 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
1985 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
1986 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
1987 MVT::v32i16);
1988 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
1989 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
1990 MVT::v32i16);
1991 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
1992 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
1993 MVT::v32i16);
1994 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
1995 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
1996 MVT::v32i16);
1997
1998 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
1999 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2000 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2001
2002 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2003 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2004
2005 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2006 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2007 }
2008
2009 if (Subtarget.hasVLX()) {
2010 addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
2011 addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
2012 setGroup(MVT::v8f16);
2013 setGroup(MVT::v16f16);
2014
2015 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2016 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2017 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2018 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2019 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2020 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2021 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2022 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2023 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2024 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2025
2026 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2027 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2028 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2029 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2030 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2031 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2032
2033 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2034 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2035 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2036
2037 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2038 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2039 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2040
2041 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2042 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2043 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2044 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2045
2046 // Need to custom widen these to prevent scalarization.
2047 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2048 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2049 }
2050
2051    // Support the fp16 zero immediate.
2052 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
2053 }
2054
2055 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2056 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2057 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2058 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2059 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2060 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2061
2062 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2063 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2064 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2065 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2066 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2067
2068 if (Subtarget.hasBWI()) {
2069 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2070 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2071 }
2072
2073 if (Subtarget.hasFP16()) {
2074 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2075 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2076 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2077 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2078 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2079 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2080 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2081 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2082 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2083 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2084 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2085 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2086 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2087 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2088 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2089 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2090 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2091 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2092 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2093 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2094 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2095 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2096 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2097 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2098 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2099 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2100 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2101 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2102 }
2103
2104 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2105 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2106 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2107 }
2108
2109 if (Subtarget.hasAMXTILE()) {
2110 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2111 }
2112
2113 // We want to custom lower some of our intrinsics.
2114 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2115 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2116 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2117 if (!Subtarget.is64Bit()) {
2118 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2119 }
2120
2121  // Only custom-lower 64-bit SADDO and friends on 64-bit targets because we
2122  // don't handle type legalization for these operations here.
2123 //
2124 // FIXME: We really should do custom legalization for addition and
2125 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2126 // than generic legalization for 64-bit multiplication-with-overflow, though.
2127 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2128 if (VT == MVT::i64 && !Subtarget.is64Bit())
2129 continue;
2130 // Add/Sub/Mul with overflow operations are custom lowered.
2131 setOperationAction(ISD::SADDO, VT, Custom);
2132 setOperationAction(ISD::UADDO, VT, Custom);
2133 setOperationAction(ISD::SSUBO, VT, Custom);
2134 setOperationAction(ISD::USUBO, VT, Custom);
2135 setOperationAction(ISD::SMULO, VT, Custom);
2136 setOperationAction(ISD::UMULO, VT, Custom);
2137
2138 // Support carry in as value rather than glue.
2139 setOperationAction(ISD::ADDCARRY, VT, Custom);
2140 setOperationAction(ISD::SUBCARRY, VT, Custom);
2141 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2142 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2143 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2144 }
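  // Illustrative aside (an editorial note, not original code): these ISD
  // overflow nodes are produced from the IR overflow intrinsics such as
  // llvm.sadd.with.overflow.i32 and llvm.umul.with.overflow.i64, which is why
  // the custom lowering above is keyed on the scalar integer types.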
2145
2146 if (!Subtarget.is64Bit()) {
2147    // These libcalls are not available in 32-bit mode.
2148 setLibcallName(RTLIB::SHL_I128, nullptr);
2149 setLibcallName(RTLIB::SRL_I128, nullptr);
2150 setLibcallName(RTLIB::SRA_I128, nullptr);
2151 setLibcallName(RTLIB::MUL_I128, nullptr);
2152 setLibcallName(RTLIB::MULO_I128, nullptr);
2153 }
2154
2155 // Combine sin / cos into _sincos_stret if it is available.
2156 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2157 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2158 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2159 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2160 }
2161
2162 if (Subtarget.isTargetWin64()) {
2163 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2164 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2165 setOperationAction(ISD::SREM, MVT::i128, Custom);
2166 setOperationAction(ISD::UREM, MVT::i128, Custom);
2167 }
2168
2169  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2170  // is. We should promote the value to 64 bits to solve this.
2171  // This is what the CRT headers do - `fmodf` is an inline header
2172  // function that casts to f64 and calls `fmod`.
2173 if (Subtarget.is32Bit() &&
2174 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2175 for (ISD::NodeType Op :
2176 {ISD::FCEIL, ISD::STRICT_FCEIL,
2177 ISD::FCOS, ISD::STRICT_FCOS,
2178 ISD::FEXP, ISD::STRICT_FEXP,
2179 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2180 ISD::FREM, ISD::STRICT_FREM,
2181 ISD::FLOG, ISD::STRICT_FLOG,
2182 ISD::FLOG10, ISD::STRICT_FLOG10,
2183 ISD::FPOW, ISD::STRICT_FPOW,
2184 ISD::FSIN, ISD::STRICT_FSIN})
2185 if (isOperationExpand(Op, MVT::f32))
2186 setOperationAction(Op, MVT::f32, Promote);
2187
2188  // We have target-specific DAG combine patterns for the following nodes:
2189 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2190 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2191 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
2192 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2193 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2194 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2195 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2196 setTargetDAGCombine(ISD::BITCAST);
2197 setTargetDAGCombine(ISD::VSELECT);
2198 setTargetDAGCombine(ISD::SELECT);
2199 setTargetDAGCombine(ISD::SHL);
2200 setTargetDAGCombine(ISD::SRA);
2201 setTargetDAGCombine(ISD::SRL);
2202 setTargetDAGCombine(ISD::OR);
2203 setTargetDAGCombine(ISD::AND);
2204 setTargetDAGCombine(ISD::ADD);
2205 setTargetDAGCombine(ISD::FADD);
2206 setTargetDAGCombine(ISD::FSUB);
2207 setTargetDAGCombine(ISD::FNEG);
2208 setTargetDAGCombine(ISD::FMA);
2209 setTargetDAGCombine(ISD::STRICT_FMA);
2210 setTargetDAGCombine(ISD::FMINNUM);
2211 setTargetDAGCombine(ISD::FMAXNUM);
2212 setTargetDAGCombine(ISD::SUB);
2213 setTargetDAGCombine(ISD::LOAD);
2214 setTargetDAGCombine(ISD::MLOAD);
2215 setTargetDAGCombine(ISD::STORE);
2216 setTargetDAGCombine(ISD::MSTORE);
2217 setTargetDAGCombine(ISD::TRUNCATE);
2218 setTargetDAGCombine(ISD::ZERO_EXTEND);
2219 setTargetDAGCombine(ISD::ANY_EXTEND);
2220 setTargetDAGCombine(ISD::SIGN_EXTEND);
2221 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2222 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2223 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2224 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2225 setTargetDAGCombine(ISD::SINT_TO_FP);
2226 setTargetDAGCombine(ISD::UINT_TO_FP);
2227 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2228 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2229 setTargetDAGCombine(ISD::SETCC);
2230 setTargetDAGCombine(ISD::MUL);
2231 setTargetDAGCombine(ISD::XOR);
2232 setTargetDAGCombine(ISD::MSCATTER);
2233 setTargetDAGCombine(ISD::MGATHER);
2234 setTargetDAGCombine(ISD::FP16_TO_FP);
2235 setTargetDAGCombine(ISD::FP_EXTEND);
2236 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2237 setTargetDAGCombine(ISD::FP_ROUND);
2238
2239 computeRegisterProperties(Subtarget.getRegisterInfo());
2240
2241 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2242 MaxStoresPerMemsetOptSize = 8;
2243 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2244 MaxStoresPerMemcpyOptSize = 4;
2245 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2246 MaxStoresPerMemmoveOptSize = 4;
2247
2248 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2249  // that needs to be benchmarked and balanced with the potential use of vector
2250 // load/store types (PR33329, PR33914).
2251 MaxLoadsPerMemcmp = 2;
2252 MaxLoadsPerMemcmpOptSize = 2;
2253
2254 // Default loop alignment, which can be overridden by -align-loops.
2255 setPrefLoopAlignment(Align(16));
2256
2257 // An out-of-order CPU can speculatively execute past a predictable branch,
2258 // but a conditional move could be stalled by an expensive earlier operation.
2259 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2260 EnableExtLdPromotion = true;
2261 setPrefFunctionAlignment(Align(16));
2262
2263 verifyIntrinsicTables();
2264
2265 // Default to having -disable-strictnode-mutation on
2266 IsStrictFPEnabled = true;
2267}
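// Illustrative usage sketch (an editorial aside; assumes a
// `const X86TargetLowering &TLI` reference is in scope). These are the
// standard TargetLoweringBase query helpers through which the tables built in
// the constructor above are consulted during legalization:
//
//   if (TLI.isOperationCustom(ISD::FROUND, MVT::v8f32))
//     ; // Legalization routes v8f32 FROUND through LowerOperation().
//   TargetLoweringBase::LegalizeAction A =
//       TLI.getOperationAction(ISD::MUL, MVT::v8i64); // Legal with DQI and
//                                                     // 512-bit regs, else Custom.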
2268
2269// This has so far only been implemented for 64-bit MachO.
2270bool X86TargetLowering::useLoadStackGuardNode() const {
2271 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2272}
2273
2274bool X86TargetLowering::useStackGuardXorFP() const {
2275 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2276 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2277}
2278
2279SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2280 const SDLoc &DL) const {
2281 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2282 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2283 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2284 return SDValue(Node, 0);
2285}
2286
2287TargetLoweringBase::LegalizeTypeAction
2288X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2289 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2290 !Subtarget.hasBWI())
2291 return TypeSplitVector;
2292
2293 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2294 VT.getVectorElementType() != MVT::i1)
2295 return TypeWidenVector;
2296
2297 return TargetLoweringBase::getPreferredVectorAction(VT);
2298}
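// Worked examples of the policy above (illustrative):
//   v32i1 with AVX512 but no BWI -> TypeSplitVector (split into v16i1 halves).
//   v3i32 (non-i1, more than one element) -> TypeWidenVector (e.g. to v4i32).
//   v1i64, or other vXi1 cases -> the generic TargetLoweringBase policy.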
2299
2300static std::pair<MVT, unsigned>
2301handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2302 const X86Subtarget &Subtarget) {
2303 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2304 // convention is one that uses k registers.
2305 if (NumElts == 2)
2306 return {MVT::v2i64, 1};
2307 if (NumElts == 4)
2308 return {MVT::v4i32, 1};
2309 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2310 CC != CallingConv::Intel_OCL_BI)
2311 return {MVT::v8i16, 1};
2312 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2313 CC != CallingConv::Intel_OCL_BI)
2314 return {MVT::v16i8, 1};
2315 // v32i1 passes in ymm unless we have BWI and the calling convention is
2316 // regcall.
2317 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2318 return {MVT::v32i8, 1};
2319 // Split v64i1 vectors if we don't have v64i8 available.
2320 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2321 if (Subtarget.useAVX512Regs())
2322 return {MVT::v64i8, 1};
2323 return {MVT::v32i8, 2};
2324 }
2325
2326  // Break wide or odd vXi1 vectors into scalars to match AVX2 behavior.
2327 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2328 NumElts > 64)
2329 return {MVT::i8, NumElts};
2330
2331 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2332}
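// Summary of the mapping above (illustrative; for X86_RegCall / Intel_OCL_BI
// the 8/16 element cases fall through to the default k-register handling):
//   v2i1  -> {v2i64, 1}   v4i1  -> {v4i32, 1}
//   v8i1  -> {v8i16, 1}   v16i1 -> {v16i8, 1}
//   v32i1 -> {v32i8, 1}   unless BWI and RegCall
//   v64i1 -> {v64i8, 1} with 512-bit regs, else {v32i8, 2}   (BWI, non-RegCall)
//   non-power-of-2, >64, or 64 without BWI -> {i8, NumElts}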
2333
2334MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2335 CallingConv::ID CC,
2336 EVT VT) const {
2337 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2338 Subtarget.hasAVX512()) {
2339 unsigned NumElts = VT.getVectorNumElements();
2340
2341 MVT RegisterVT;
2342 unsigned NumRegisters;
2343 std::tie(RegisterVT, NumRegisters) =
2344 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2345 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2346 return RegisterVT;
2347 }
2348
2349  // v3f16 will be widened to v4f16, but we don't assign a register class for
2350  // v4f16, so its default register type is f16. We override the type to v8f16 here.
2351 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2352 return MVT::v8f16;
2353
2354 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2355}
2356
2357unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2358 CallingConv::ID CC,
2359 EVT VT) const {
2360 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2361 Subtarget.hasAVX512()) {
2362 unsigned NumElts = VT.getVectorNumElements();
2363
2364 MVT RegisterVT;
2365 unsigned NumRegisters;
2366 std::tie(RegisterVT, NumRegisters) =
2367 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2368 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2369 return NumRegisters;
2370 }
2371
2372  // v3f16 will be widened to v4f16, but we don't assign a register class for
2373  // v4f16, so its default register number is 3. We override the number to 1 here.
2374 if (VT == MVT::v3f16 && Subtarget.hasFP16())
2375 return 1;
2376
2377 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2378}
2379
2380unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2381 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2382 unsigned &NumIntermediates, MVT &RegisterVT) const {
2383  // Break wide or odd vXi1 vectors into scalars to match AVX2 behavior.
2384 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2385 Subtarget.hasAVX512() &&
2386 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2387 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2388 VT.getVectorNumElements() > 64)) {
2389 RegisterVT = MVT::i8;
2390 IntermediateVT = MVT::i1;
2391 NumIntermediates = VT.getVectorNumElements();
2392 return NumIntermediates;
2393 }
2394
2395 // Split v64i1 vectors if we don't have v64i8 available.
2396 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2397 CC != CallingConv::X86_RegCall) {
2398 RegisterVT = MVT::v32i8;
2399 IntermediateVT = MVT::v32i1;
2400 NumIntermediates = 2;
2401 return 2;
2402 }
2403
2404 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2405 NumIntermediates, RegisterVT);
2406}
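// Worked examples (illustrative): a v64i1 argument with AVX512BW but only
// 256-bit registers enabled (non-RegCall) is passed as 2 x v32i8 with
// IntermediateVT = v32i1; a v5i1 argument with AVX512 is broken into five
// scalar pieces with RegisterVT = i8 and IntermediateVT = i1.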
2407
2408EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2409 LLVMContext& Context,
2410 EVT VT) const {
2411 if (!VT.isVector())
2412 return MVT::i8;
2413
2414 if (Subtarget.hasAVX512()) {
2415 // Figure out what this type will be legalized to.
2416 EVT LegalVT = VT;
2417 while (getTypeAction(Context, LegalVT) != TypeLegal)
2418 LegalVT = getTypeToTransformTo(Context, LegalVT);
2419
2420 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2421 if (LegalVT.getSimpleVT().is512BitVector())
2422 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2423
2424 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2425 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2426 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2427 // vXi16/vXi8.
2428 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2429 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2430 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2431 }
2432 }
2433
2434 return VT.changeVectorElementTypeToInteger();
2435}
2436
2437/// Helper for getByValTypeAlignment to determine
2438/// the desired ByVal argument alignment.
2439static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2440 if (MaxAlign == 16)
2441 return;
2442 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2443 if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2444 MaxAlign = Align(16);
2445 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2446 Align EltAlign;
2447 getMaxByValAlign(ATy->getElementType(), EltAlign);
2448 if (EltAlign > MaxAlign)
2449 MaxAlign = EltAlign;
2450 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2451 for (auto *EltTy : STy->elements()) {
2452 Align EltAlign;
2453 getMaxByValAlign(EltTy, EltAlign);
2454 if (EltAlign > MaxAlign)
2455 MaxAlign = EltAlign;
2456 if (MaxAlign == 16)
2457 break;
2458 }
2459 }
2460}
2461
2462/// Return the desired alignment for ByVal aggregate
2463/// function arguments in the caller parameter area. For X86, aggregates
2464/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2465/// are at 4-byte boundaries.
2466unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2467 const DataLayout &DL) const {
2468 if (Subtarget.is64Bit()) {
2469 // Max of 8 and alignment of type.
2470 Align TyAlign = DL.getABITypeAlign(Ty);
2471 if (TyAlign > 8)
2472 return TyAlign.value();
2473 return 8;
2474 }
2475
2476 Align Alignment(4);
2477 if (Subtarget.hasSSE1())
2478 getMaxByValAlign(Ty, Alignment);
2479 return Alignment.value();
2480}
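
Put differently: on 64-bit targets the byval alignment is max(8, ABI alignment of the type), and on 32-bit targets it is 16 only when SSE1 is enabled and the aggregate transitively contains a 128-bit vector, otherwise 4. A minimal standalone sketch of that rule (the TypeInfo stand-in is hypothetical, not LLVM's Type):

#include <algorithm>
#include <cstdio>

// Hypothetical, simplified stand-in for type inspection: we only track the
// ABI alignment and whether the aggregate transitively contains a 128-bit
// vector, which is all getByValTypeAlignment/getMaxByValAlign look at.
struct TypeInfo {
  unsigned ABIAlign;        // DL.getABITypeAlign(Ty)
  bool ContainsM128Vector;  // any nested element of 128-bit vector size
};

static unsigned byValAlignment(const TypeInfo &Ty, bool Is64Bit, bool HasSSE1) {
  if (Is64Bit)
    return std::max(8u, Ty.ABIAlign); // max of 8 and the type's alignment
  if (HasSSE1 && Ty.ContainsM128Vector)
    return 16;                        // SSE aggregates go to 16-byte boundaries
  return 4;                           // everything else: 4 bytes
}

int main() {
  TypeInfo StructWithM128{16, true};
  TypeInfo PlainStruct{4, false};
  std::printf("32-bit, SSE, struct with __m128: %u\n",
              byValAlignment(StructWithM128, false, true));
  std::printf("32-bit, SSE, plain struct:       %u\n",
              byValAlignment(PlainStruct, false, true));
  std::printf("64-bit, plain struct:            %u\n",
              byValAlignment(PlainStruct, true, true));
  return 0;
}
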
2481
2482/// It returns EVT::Other if the type should be determined using generic
2483/// target-independent logic.
2484/// For vector ops we check that the overall size isn't larger than our
2485/// preferred vector width.
2486EVT X86TargetLowering::getOptimalMemOpType(
2487 const MemOp &Op, const AttributeList &FuncAttributes) const {
2488 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2489 if (Op.size() >= 16 &&
2490 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2491 // FIXME: Check if unaligned 64-byte accesses are slow.
2492 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2493 (Subtarget.getPreferVectorWidth() >= 512)) {
2494 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2495 }
2496 // FIXME: Check if unaligned 32-byte accesses are slow.
2497 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2498 (Subtarget.getPreferVectorWidth() >= 256)) {
2499 // Although this isn't a well-supported type for AVX1, we'll let
2500 // legalization and shuffle lowering produce the optimal codegen. If we
2501 // choose an optimal type with a vector element larger than a byte,
2502 // getMemsetStores() may create an intermediate splat (using an integer
2503 // multiply) before we splat as a vector.
2504 return MVT::v32i8;
2505 }
2506 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2507 return MVT::v16i8;
2508 // TODO: Can SSE1 handle a byte vector?
2509 // If we have SSE1 registers we should be able to use them.
2510 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2511 (Subtarget.getPreferVectorWidth() >= 128))
2512 return MVT::v4f32;
2513 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2514 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2515 // Do not use f64 to lower memcpy if source is string constant. It's
2516 // better to use i32 to avoid the loads.
2517 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2518 // The gymnastics of splatting a byte value into an XMM register and then
2519 // only using 8-byte stores (because this is a CPU with slow unaligned
2520 // 16-byte accesses) makes that a loser.
2521 return MVT::f64;
2522 }
2523 }
2524 // This is a compromise. If we reach here, unaligned accesses may be slow on
2525 // this target. However, creating smaller, aligned accesses could be even
2526 // slower and would certainly be a lot more code.
2527 if (Subtarget.is64Bit() && Op.size() >= 8)
2528 return MVT::i64;
2529 return MVT::i32;
2530}
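
The ladder above is essentially a size/feature table. A rough standalone approximation follows (not the LLVM API; it deliberately ignores the NoImplicitFloat, slow-unaligned-access and memcpy-from-string refinements handled in the real function):

#include <cstdio>

// Rough, standalone approximation of the type ladder in getOptimalMemOpType.
static const char *optimalMemOpType(unsigned Size, bool HasAVX512, bool HasBWI,
                                    bool HasAVX, bool HasSSE2, bool Is64Bit,
                                    unsigned PreferVectorWidth) {
  if (Size >= 64 && HasAVX512 && PreferVectorWidth >= 512)
    return HasBWI ? "v64i8" : "v16i32"; // 512-bit ops when they are preferred
  if (Size >= 32 && HasAVX && PreferVectorWidth >= 256)
    return "v32i8";                     // 256-bit byte vector
  if (Size >= 16 && HasSSE2 && PreferVectorWidth >= 128)
    return "v16i8";                     // 128-bit byte vector
  if (Is64Bit && Size >= 8)
    return "i64";                       // fall back to scalar GPR stores
  return "i32";
}

int main() {
  std::printf("%s\n", optimalMemOpType(64, true, true, true, true, true, 512));
  std::printf("%s\n", optimalMemOpType(32, false, false, true, true, true, 256));
  std::printf("%s\n", optimalMemOpType(8, false, false, false, false, true, 0));
  return 0;
}
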
2531
2532bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2533 if (VT == MVT::f32)
2534 return X86ScalarSSEf32;
2535 if (VT == MVT::f64)
2536 return X86ScalarSSEf64;
2537 return true;
2538}
2539
2540bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2541 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2542 bool *Fast) const {
2543 if (Fast) {
2544 switch (VT.getSizeInBits()) {
2545 default:
2546 // 8-byte and under are always assumed to be fast.
2547 *Fast = true;
2548 break;
2549 case 128:
2550 *Fast = !Subtarget.isUnalignedMem16Slow();
2551 break;
2552 case 256:
2553 *Fast = !Subtarget.isUnalignedMem32Slow();
2554 break;
2555 // TODO: What about AVX-512 (512-bit) accesses?
2556 }
2557 }
2558 // NonTemporal vector memory ops must be aligned.
2559 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2560 // NT loads can only be vector aligned, so if it's less aligned than the
2561 // minimum vector size (which we can split the vector down to), we might as
2562 // well use a regular unaligned vector load.
2563 // We don't have any NT loads pre-SSE41.
2564 if (!!(Flags & MachineMemOperand::MOLoad))
2565 return (Alignment < 16 || !Subtarget.hasSSE41());
2566 return false;
2567 }
2568 // Misaligned accesses of any size are always allowed.
2569 return true;
2570}
2571
2572/// Return the entry encoding for a jump table in the
2573/// current function. The returned value is a member of the
2574/// MachineJumpTableInfo::JTEntryKind enum.
2575unsigned X86TargetLowering::getJumpTableEncoding() const {
2576 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2577 // symbol.
2578 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2579 return MachineJumpTableInfo::EK_Custom32;
2580
2581 // Otherwise, use the normal jump table encoding heuristics.
2582 return TargetLowering::getJumpTableEncoding();
2583}
2584
2585bool X86TargetLowering::useSoftFloat() const {
2586 return Subtarget.useSoftFloat();
2587}
2588
2589void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2590 ArgListTy &Args) const {
2591
2592 // Only relabel X86-32 for C / Stdcall CCs.
2593 if (Subtarget.is64Bit())
2594 return;
2595 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2596 return;
2597 unsigned ParamRegs = 0;
2598 if (auto *M = MF->getFunction().getParent())
2599 ParamRegs = M->getNumberRegisterParameters();
2600
2601 // Mark the first N int arguments as being passed in registers.
2602 for (auto &Arg : Args) {
2603 Type *T = Arg.Ty;
2604 if (T->isIntOrPtrTy())
2605 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2606 unsigned numRegs = 1;
2607 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2608 numRegs = 2;
2609 if (ParamRegs < numRegs)
2610 return;
2611 ParamRegs -= numRegs;
2612 Arg.IsInReg = true;
2613 }
2614 }
2615}
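
The loop above runs a simple register budget: an integer or pointer argument of at most 4 bytes costs one register parameter, a 5..8-byte one costs two, and marking stops as soon as the module-level budget is exhausted. A standalone sketch of that accounting (the Arg descriptor is hypothetical):

#include <cstdio>
#include <vector>

// Hypothetical flattened argument: byte size, kind, and the resulting flag.
struct Arg {
  unsigned SizeInBytes;
  bool IsIntOrPtr;
  bool InReg;
};

// Mirrors the budget logic in markLibCallAttributes: 1 register for <=4-byte
// integer args, 2 for 5..8-byte ones, stop once the budget runs out.
static void markInRegArgs(std::vector<Arg> &Args, unsigned ParamRegs) {
  for (Arg &A : Args) {
    if (!A.IsIntOrPtr || A.SizeInBytes > 8)
      continue;
    unsigned NumRegs = A.SizeInBytes > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      return;              // budget exhausted: later args stay on the stack
    ParamRegs -= NumRegs;
    A.InReg = true;
  }
}

int main() {
  std::vector<Arg> Args = {{4, true, false}, {8, true, false}, {4, true, false}};
  markInRegArgs(Args, /*ParamRegs=*/3); // e.g. a module built with -mregparm=3
  for (const Arg &A : Args)
    std::printf("size=%u inreg=%d\n", A.SizeInBytes, A.InReg);
  return 0;
}
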
2616
2617const MCExpr *
2618X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2619 const MachineBasicBlock *MBB,
2620 unsigned uid,MCContext &Ctx) const{
2621 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2622 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2623 // entries.
2624 return MCSymbolRefExpr::create(MBB->getSymbol(),
2625 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2626}
2627
2628/// Returns relocation base for the given PIC jumptable.
2629SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2630 SelectionDAG &DAG) const {
2631 if (!Subtarget.is64Bit())
2632 // This doesn't have SDLoc associated with it, but is not really the
2633 // same as a Register.
2634 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2635 getPointerTy(DAG.getDataLayout()));
2636 return Table;
2637}
2638
2639/// This returns the relocation base for the given PIC jumptable,
2640/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2641const MCExpr *X86TargetLowering::
2642getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2643 MCContext &Ctx) const {
2644 // X86-64 uses RIP relative addressing based on the jump table label.
2645 if (Subtarget.isPICStyleRIPRel())
2646 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2647
2648 // Otherwise, the reference is relative to the PIC base.
2649 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2650}
2651
2652std::pair<const TargetRegisterClass *, uint8_t>
2653X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2654 MVT VT) const {
2655 const TargetRegisterClass *RRC = nullptr;
2656 uint8_t Cost = 1;
2657 switch (VT.SimpleTy) {
2658 default:
2659 return TargetLowering::findRepresentativeClass(TRI, VT);
2660 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2661 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2662 break;
2663 case MVT::x86mmx:
2664 RRC = &X86::VR64RegClass;
2665 break;
2666 case MVT::f32: case MVT::f64:
2667 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2668 case MVT::v4f32: case MVT::v2f64:
2669 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2670 case MVT::v8f32: case MVT::v4f64:
2671 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2672 case MVT::v16f32: case MVT::v8f64:
2673 RRC = &X86::VR128XRegClass;
2674 break;
2675 }
2676 return std::make_pair(RRC, Cost);
2677}
2678
2679unsigned X86TargetLowering::getAddressSpace() const {
2680 if (Subtarget.is64Bit())
2681 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2682 return 256;
2683}
2684
2685static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2686 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2687 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2688}
2689
2690static Constant* SegmentOffset(IRBuilderBase &IRB,
2691 int Offset, unsigned AddressSpace) {
2692 return ConstantExpr::getIntToPtr(
2693 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2694 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2695}
2696
2697Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2698 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2699 // tcbhead_t; use it instead of the usual global variable (see
2700 // sysdeps/{i386,x86_64}/nptl/tls.h)
2701 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2702 if (Subtarget.isTargetFuchsia()) {
2703 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2704 return SegmentOffset(IRB, 0x10, getAddressSpace());
2705 } else {
2706 unsigned AddressSpace = getAddressSpace();
2707 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2708 // In particular, some users may customize the base reg and offset.
2709 int Offset = M->getStackProtectorGuardOffset();
2710 // If the -stack-protector-guard-offset value is not set, the guard lives at
2711 // %fs:0x28, unless we're using a Kernel code model, in which case
2712 // it's %gs:0x28. It is %gs:0x14 on i386.
2713 if (Offset == INT_MAX)
2714 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2715
2716 StringRef GuardReg = M->getStackProtectorGuardReg();
2717 if (GuardReg == "fs")
2718 AddressSpace = X86AS::FS;
2719 else if (GuardReg == "gs")
2720 AddressSpace = X86AS::GS;
2721 return SegmentOffset(IRB, Offset, AddressSpace);
2722 }
2723 }
2724 return TargetLowering::getIRStackGuard(IRB);
2725}
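
So on glibc/bionic/Fuchsia targets the guard is read from a thread segment rather than from a global variable. The following standalone sketch (illustrative only, not the LLVM API; it ignores the Kernel code model case) shows how the (address space, offset) pair is chosen, using X86's convention that address space 256 maps to %gs and 257 to %fs:

#include <climits>
#include <cstdio>
#include <string>
#include <utility>

// Illustrative (non-LLVM) selection of the TLS slot used for the stack guard,
// following getIRStackGuard above.
static std::pair<unsigned, int> stackGuardSlot(bool IsFuchsia, bool Is64Bit,
                                               int CustomOffset, // INT_MAX if unset
                                               const std::string &GuardReg) {
  if (IsFuchsia)
    return {Is64Bit ? 257u : 256u, 0x10}; // ZX_TLS_STACK_GUARD_OFFSET
  int Offset = CustomOffset;
  if (Offset == INT_MAX)
    Offset = Is64Bit ? 0x28 : 0x14;       // %fs:0x28 on x86-64, %gs:0x14 on i386
  unsigned AddressSpace = Is64Bit ? 257u : 256u;
  if (GuardReg == "fs")
    AddressSpace = 257;                   // X86AS::FS
  else if (GuardReg == "gs")
    AddressSpace = 256;                   // X86AS::GS
  return {AddressSpace, Offset};
}

int main() {
  std::pair<unsigned, int> Slot = stackGuardSlot(false, true, INT_MAX, "");
  std::printf("addrspace(%u), offset 0x%x\n", Slot.first, Slot.second);
  return 0;
}
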
2726
2727void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2728 // MSVC CRT provides functionalities for stack protection.
2729 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2730 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2731 // MSVC CRT has a global variable holding security cookie.
2732 M.getOrInsertGlobal("__security_cookie",
2733 Type::getInt8PtrTy(M.getContext()));
2734
2735 // MSVC CRT has a function to validate security cookie.
2736 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2737 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2738 Type::getInt8PtrTy(M.getContext()));
2739 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2740 F->setCallingConv(CallingConv::X86_FastCall);
2741 F->addParamAttr(0, Attribute::AttrKind::InReg);
2742 }
2743 return;
2744 }
2745
2746 StringRef GuardMode = M.getStackProtectorGuard();
2747
2748 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2749 if ((GuardMode == "tls" || GuardMode.empty()) &&
2750 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2751 return;
2752 TargetLowering::insertSSPDeclarations(M);
2753}
2754
2755Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2756 // MSVC CRT has a global variable holding security cookie.
2757 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2758 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2759 return M.getGlobalVariable("__security_cookie");
2760 }
2761 return TargetLowering::getSDagStackGuard(M);
2762}
2763
2764Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2765 // MSVC CRT has a function to validate security cookie.
2766 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2767 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2768 return M.getFunction("__security_check_cookie");
2769 }
2770 return TargetLowering::getSSPStackGuardCheck(M);
2771}
2772
2773Value *
2774X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
2775 if (Subtarget.getTargetTriple().isOSContiki())
2776 return getDefaultSafeStackPointerLocation(IRB, false);
2777
2778 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2779 // definition of TLS_SLOT_SAFESTACK in
2780 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2781 if (Subtarget.isTargetAndroid()) {
2782 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2783 // %gs:0x48. It is %gs:0x24 on i386.
2784 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2785 return SegmentOffset(IRB, Offset, getAddressSpace());
2786 }
2787
2788 // Fuchsia is similar.
2789 if (Subtarget.isTargetFuchsia()) {
2790 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2791 return SegmentOffset(IRB, 0x18, getAddressSpace());
2792 }
2793
2794 return TargetLowering::getSafeStackPointerLocation(IRB);
2795}
2796
2797//===----------------------------------------------------------------------===//
2798// Return Value Calling Convention Implementation
2799//===----------------------------------------------------------------------===//
2800
2801bool X86TargetLowering::CanLowerReturn(
2802 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2803 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2804 SmallVector<CCValAssign, 16> RVLocs;
2805 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2806 return CCInfo.CheckReturn(Outs, RetCC_X86);
2807}
2808
2809const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2810 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2811 return ScratchRegs;
2812}
2813
2814/// Lowers mask values (v*i1) to the local register values
2815/// \returns DAG node after lowering to register type
2816static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2817 const SDLoc &Dl, SelectionDAG &DAG) {
2818 EVT ValVT = ValArg.getValueType();
2819
2820 if (ValVT == MVT::v1i1)
2821 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2822 DAG.getIntPtrConstant(0, Dl));
2823
2824 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2825 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2826 // Two stage lowering might be required
2827 // bitcast: v8i1 -> i8 / v16i1 -> i16
2828 // anyextend: i8 -> i32 / i16 -> i32
2829 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2830 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2831 if (ValLoc == MVT::i32)
2832 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2833 return ValToCopy;
2834 }
2835
2836 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2837 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2838 // One stage lowering is required
2839 // bitcast: v32i1 -> i32 / v64i1 -> i64
2840 return DAG.getBitcast(ValLoc, ValArg);
2841 }
2842
2843 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2844}
2845
2846/// Breaks a v64i1 value into two registers and adds the new node to the DAG
2847static void Passv64i1ArgInRegs(
2848 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2849 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2850 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2851 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2852 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2853 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2854 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2855        "The value should reside in two registers");
2856
2857 // Before splitting the value we cast it to i64
2858 Arg = DAG.getBitcast(MVT::i64, Arg);
2859
2860 // Splitting the value into two i32 types
2861 SDValue Lo, Hi;
2862 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2863 DAG.getConstant(0, Dl, MVT::i32));
2864 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2865 DAG.getConstant(1, Dl, MVT::i32));
2866
2867 // Attach the two i32 types into corresponding registers
2868 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2869 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2870}
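
Stripped of the SelectionDAG plumbing, this is just splitting a 64-bit mask into its low and high 32-bit halves, one per register location. A plain-integer standalone sketch:

#include <cstdint>
#include <cstdio>
#include <utility>

// Standalone equivalent of the EXTRACT_ELEMENT pair above: a v64i1 mask,
// viewed as an i64, is split into two i32 halves, one per 32-bit register.
static std::pair<uint32_t, uint32_t> splitMask64(uint64_t Mask) {
  uint32_t Lo = static_cast<uint32_t>(Mask);       // element 0: low 32 bits
  uint32_t Hi = static_cast<uint32_t>(Mask >> 32); // element 1: high 32 bits
  return {Lo, Hi};
}

int main() {
  uint64_t Mask = 0xDEADBEEF00FF00FFull;
  std::pair<uint32_t, uint32_t> Halves = splitMask64(Mask);
  std::printf("lo = 0x%08x, hi = 0x%08x\n", Halves.first, Halves.second);
  return 0;
}
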
2871
2872SDValue
2873X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2874 bool isVarArg,
2875 const SmallVectorImpl<ISD::OutputArg> &Outs,
2876 const SmallVectorImpl<SDValue> &OutVals,
2877 const SDLoc &dl, SelectionDAG &DAG) const {
2878 MachineFunction &MF = DAG.getMachineFunction();
2879 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2880
2881 // In some cases we need to disable registers from the default CSR list.
2882 // For example, when they are used for argument passing.
2883 bool ShouldDisableCalleeSavedRegister =
2884 CallConv == CallingConv::X86_RegCall ||
2885 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2886
2887 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2888 report_fatal_error("X86 interrupts may not return any value");
2889
2890 SmallVector<CCValAssign, 16> RVLocs;
2891 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2892 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2893
2894 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2895 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2896 ++I, ++OutsIndex) {
2897 CCValAssign &VA = RVLocs[I];
2898 assert(VA.isRegLoc() && "Can only return in registers!");
2899
2900 // Add the register to the CalleeSaveDisableRegs list.
2901 if (ShouldDisableCalleeSavedRegister)
2902 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2903
2904 SDValue ValToCopy = OutVals[OutsIndex];
2905 EVT ValVT = ValToCopy.getValueType();
2906
2907 // Promote values to the appropriate types.
2908 if (VA.getLocInfo() == CCValAssign::SExt)
2909 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2910 else if (VA.getLocInfo() == CCValAssign::ZExt)
2911 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2912 else if (VA.getLocInfo() == CCValAssign::AExt) {
2913 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2914 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2915 else
2916 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2917 }
2918 else if (VA.getLocInfo() == CCValAssign::BCvt)
2919 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2920
2921 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2922        "Unexpected FP-extend for return value.");
2923
2924 // Report an error if we have attempted to return a value via an XMM
2925 // register and SSE was disabled.
2926 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2927 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2928 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2929 } else if (!Subtarget.hasSSE2() &&
2930 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2931 ValVT == MVT::f64) {
2932 // When returning a double via an XMM register, report an error if SSE2 is
2933 // not enabled.
2934 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2935 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2936 }
2937
2938 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2939 // the RET instruction and handled by the FP Stackifier.
2940 if (VA.getLocReg() == X86::FP0 ||
2941 VA.getLocReg() == X86::FP1) {
2942 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2943 // change the value to the FP stack register class.
2944 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2945 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2946 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2947 // Don't emit a copytoreg.
2948 continue;
2949 }
2950
2951 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2952 // which is returned in RAX / RDX.
2953 if (Subtarget.is64Bit()) {
2954 if (ValVT == MVT::x86mmx) {
2955 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2956 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2957 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2958 ValToCopy);
2959 // If we don't have SSE2 available, convert to v4f32 so the generated
2960 // register is legal.
2961 if (!Subtarget.hasSSE2())
2962 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2963 }
2964 }
2965 }
2966
2967 if (VA.needsCustom()) {
2968 assert(VA.getValVT() == MVT::v64i1 &&
2969        "Currently the only custom case is when we split v64i1 to 2 regs");
2970
2971 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2972 Subtarget);
2973
2974 // Add the second register to the CalleeSaveDisableRegs list.
2975 if (ShouldDisableCalleeSavedRegister)
2976 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2977 } else {
2978 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2979 }
2980 }
2981
2982 SDValue Flag;
2983 SmallVector<SDValue, 6> RetOps;
2984 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2985 // Operand #1 = Bytes To Pop
2986 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2987 MVT::i32));
2988
2989 // Copy the result values into the output registers.
2990 for (auto &RetVal : RetVals) {
2991 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2992 RetOps.push_back(RetVal.second);
2993 continue; // Don't emit a copytoreg.
2994 }
2995
2996 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2997 Flag = Chain.getValue(1);
2998 RetOps.push_back(
2999 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3000 }
3001
3002 // Swift calling convention does not require we copy the sret argument
3003 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3004
3005 // All x86 ABIs require that for returning structs by value we copy
3006 // the sret argument into %rax/%eax (depending on ABI) for the return.
3007 // We saved the argument into a virtual register in the entry block,
3008 // so now we copy the value out and into %rax/%eax.
3009 //
3010 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3011 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3012 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3013 // either case FuncInfo->setSRetReturnReg() will have been called.
3014 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3015 // When we have both sret and another return value, we should use the
3016 // original Chain stored in RetOps[0], instead of the current Chain updated
3018 // in the above loop. If we only have sret, RetOps[0] equals Chain.
3018
3019 // For the case of sret and another return value, we have
3020 // Chain_0 at the function entry
3021 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3022 // If we use Chain_1 in getCopyFromReg, we will have
3023 // Val = getCopyFromReg(Chain_1)
3024 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3025
3026 // getCopyToReg(Chain_0) will be glued together with
3027 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3028 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3029 // Data dependency from Unit B to Unit A due to usage of Val in
3030 // getCopyToReg(Chain_1, Val)
3031 // Chain dependency from Unit A to Unit B
3032
3033 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3034 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3035 getPointerTy(MF.getDataLayout()));
3036
3037 Register RetValReg
3038 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3039 X86::RAX : X86::EAX;
3040 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3041 Flag = Chain.getValue(1);
3042
3043 // RAX/EAX now acts like a return value.
3044 RetOps.push_back(
3045 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3046
3047 // Add the returned register to the CalleeSaveDisableRegs list.
3048 if (ShouldDisableCalleeSavedRegister)
3049 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3050 }
3051
3052 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3053 const MCPhysReg *I =
3054 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3055 if (I) {
3056 for (; *I; ++I) {
3057 if (X86::GR64RegClass.contains(*I))
3058 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3059 else
3060 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3061 }
3062 }
3063
3064 RetOps[0] = Chain; // Update chain.
3065
3066 // Add the flag if we have it.
3067 if (Flag.getNode())
3068 RetOps.push_back(Flag);
3069
3070 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3071 if (CallConv == CallingConv::X86_INTR)
3072 opcode = X86ISD::IRET;
3073 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3074}
3075
3076bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3077 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3078 return false;
3079
3080 SDValue TCChain = Chain;
3081 SDNode *Copy = *N->use_begin();
3082 if (Copy->getOpcode() == ISD::CopyToReg) {
3083 // If the copy has a glue operand, we conservatively assume it isn't safe to
3084 // perform a tail call.
3085 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3086 return false;
3087 TCChain = Copy->getOperand(0);
3088 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3089 return false;
3090
3091 bool HasRet = false;
3092 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
3093 UI != UE; ++UI) {
3094 if (UI->getOpcode() != X86ISD::RET_FLAG)
3095 return false;
3096 // If we are returning more than one value, we can definitely
3097 // not make a tail call; see PR19530.
3098 if (UI->getNumOperands() > 4)
3099 return false;
3100 if (UI->getNumOperands() == 4 &&
3101 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
3102 return false;
3103 HasRet = true;
3104 }
3105
3106 if (!HasRet)
3107 return false;
3108
3109 Chain = TCChain;
3110 return true;
3111}
3112
3113EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3114 ISD::NodeType ExtendKind) const {
3115 MVT ReturnMVT = MVT::i32;
3116
3117 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3118 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3119 // The ABI does not require i1, i8 or i16 to be extended.
3120 //
3121 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3122 // always extending i8/i16 return values, so keep doing that for now.
3123 // (PR26665).
3124 ReturnMVT = MVT::i8;
3125 }
3126
3127 EVT MinVT = getRegisterType(Context, ReturnMVT);
3128 return VT.bitsLT(MinVT) ? MinVT : VT;
3129}
3130
3131/// Reads two 32 bit registers and creates a 64 bit mask value.
3132/// \param VA The current 32 bit value that needs to be assigned.
3133/// \param NextVA The next 32 bit value that needs to be assigned.
3134/// \param Root The parent DAG node.
3135/// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
3136/// for glue purposes. If the DAG is already using a
3137/// physical register instead of a virtual one, we should
3138/// glue our new SDValue to the InFlag SDValue.
3139/// \return a new 64 bit wide SDValue.
3140static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3141 SDValue &Root, SelectionDAG &DAG,
3142 const SDLoc &Dl, const X86Subtarget &Subtarget,
3143 SDValue *InFlag = nullptr) {
3144 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3145 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3146 assert(VA.getValVT() == MVT::v64i1 &&
3147        "Expecting first location of 64 bit width type");
3148 assert(NextVA.getValVT() == VA.getValVT() &&
3149        "The locations should have the same type");
3150 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3151        "The values should reside in two registers");
3152
3153 SDValue Lo, Hi;
3154 SDValue ArgValueLo, ArgValueHi;
3155
3156 MachineFunction &MF = DAG.getMachineFunction();
3157 const TargetRegisterClass *RC = &X86::GR32RegClass;
3158
3159 // Read a 32 bit value from the registers.
3160 if (nullptr == InFlag) {
3161 // When no physical register is present,
3162 // create an intermediate virtual register.
3163 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3164 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3165 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3166 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3167 } else {
3168 // When a physical register is available read the value from it and glue
3169 // the reads together.
3170 ArgValueLo =
3171 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3172 *InFlag = ArgValueLo.getValue(2);
3173 ArgValueHi =
3174 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3175 *InFlag = ArgValueHi.getValue(2);
3176 }
3177
3178 // Convert the i32 type into v32i1 type.
3179 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3180
3181 // Convert the i32 type into v32i1 type.
3182 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3183
3184 // Concatenate the two values together.
3185 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3186}
3187
3188/// The function will lower a register of various sizes (8/16/32/64)
3189/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3190/// \returns a DAG node containing the operand after lowering to mask type.
3191static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3192 const EVT &ValLoc, const SDLoc &Dl,
3193 SelectionDAG &DAG) {
3194 SDValue ValReturned = ValArg;
3195
3196 if (ValVT == MVT::v1i1)
3197 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3198
3199 if (ValVT == MVT::v64i1) {
3200 // On a 32 bit machine this case is handled by getv64i1Argument.
3201 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3202 // On a 64 bit machine there is no need to truncate the value, only bitcast it.
3203 } else {
3204 MVT maskLen;
3205 switch (ValVT.getSimpleVT().SimpleTy) {
3206 case MVT::v8i1:
3207 maskLen = MVT::i8;
3208 break;
3209 case MVT::v16i1:
3210 maskLen = MVT::i16;
3211 break;
3212 case MVT::v32i1:
3213 maskLen = MVT::i32;
3214 break;
3215 default:
3216 llvm_unreachable("Expecting a vector of i1 types");
3217 }
3218
3219 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3220 }
3221 return DAG.getBitcast(ValVT, ValReturned);
3222}
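
Going the other way, from an integer register location back to a mask, only requires truncating to the mask width and reinterpreting the bits. A standalone integer-level sketch (the helper is hypothetical):

#include <cstdint>
#include <cstdio>

// Standalone picture of lowerRegToMasks for the v8i1/v16i1/v32i1 cases: the
// incoming register value is truncated to the mask width, and the bitcast back
// to a vector of i1 is a pure bit reinterpretation, so a plain integer suffices.
static uint64_t regToMask(uint64_t RegValue, unsigned MaskBits) {
  if (MaskBits >= 64)
    return RegValue;                          // v64i1: nothing to truncate
  return RegValue & ((1ull << MaskBits) - 1); // keep only the low MaskBits bits
}

int main() {
  // A v8i1 result that came back in a 32-bit location: only the low 8 bits matter.
  std::printf("0x%llx\n", (unsigned long long)regToMask(0xABCD12F0u, 8));
  return 0;
}
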
3223
3224/// Lower the result values of a call into the
3225/// appropriate copies out of appropriate physical registers.
3226///
3227SDValue X86TargetLowering::LowerCallResult(
3228 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3229 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3230 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3231 uint32_t *RegMask) const {
3232
3233 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3234 // Assign locations to each value returned by this call.
3235 SmallVector<CCValAssign, 16> RVLocs;
3236 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3237 *DAG.getContext());
3238 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3239
3240 // Copy all of the result registers out of their specified physreg.
3241 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3242 ++I, ++InsIndex) {
3243 CCValAssign &VA = RVLocs[I];
3244 EVT CopyVT = VA.getLocVT();
3245
3246 // In some calling conventions we need to remove the used registers
3247 // from the register mask.
3248 if (RegMask) {
3249 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3250 SubRegs.isValid(); ++SubRegs)
3251 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3252 }
3253
3254 // Report an error if there was an attempt to return FP values via XMM
3255 // registers.
3256 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3257 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3258 if (VA.getLocReg() == X86::XMM1)
3259 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3260 else
3261 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3262 } else if (!Subtarget.hasSSE2() &&
3263 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3264 CopyVT == MVT::f64) {
3265 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3266 if (VA.getLocReg() == X86::XMM1)
3267 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3268 else
3269 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3270 }
3271
3272 // If we prefer to use the value in xmm registers, copy it out as f80 and
3273 // use a truncate to move it from fp stack reg to xmm reg.
3274 bool RoundAfterCopy = false;
3275 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3276 isScalarFPTypeInSSEReg(VA.getValVT())) {
3277 if (!Subtarget.hasX87())
3278 report_fatal_error("X87 register return with X87 disabled");
3279 CopyVT = MVT::f80;
3280 RoundAfterCopy = (CopyVT != VA.getLocVT());
3281 }
3282
3283 SDValue Val;
3284 if (VA.needsCustom()) {
3285 assert(VA.getValVT() == MVT::v64i1 &&
3286        "Currently the only custom case is when we split v64i1 to 2 regs");
3287 Val =
3288 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3289 } else {
3290 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3291 .getValue(1);
3292 Val = Chain.getValue(0);
3293 InFlag = Chain.getValue(2);
3294 }
3295
3296 if (RoundAfterCopy)
3297 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3298 // This truncation won't change the value.
3299 DAG.getIntPtrConstant(1, dl));
3300
3301 if (VA.isExtInLoc()) {
3302 if (VA.getValVT().isVector() &&
3303 VA.getValVT().getScalarType() == MVT::i1 &&
3304 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3305 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3306 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3307 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3308 } else
3309 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3310 }
3311
3312 if (VA.getLocInfo() == CCValAssign::BCvt)
3313 Val = DAG.getBitcast(VA.getValVT(), Val);
3314
3315 InVals.push_back(Val);
3316 }
3317
3318 return Chain;
3319}
3320
3321//===----------------------------------------------------------------------===//
3322// C & StdCall & Fast Calling Convention implementation
3323//===----------------------------------------------------------------------===//
3324// The StdCall calling convention seems to be standard for many Windows API
3325// routines. It differs from the C calling convention just a little: the
3326// callee should clean up the stack, not the caller. Symbols are also
3327// decorated in some fancy way :) It doesn't support any vector arguments.
3328// For info on fast calling convention see Fast Calling Convention (tail call)
3329// implementation LowerX86_32FastCCCallTo.
3330
3331/// CallIsStructReturn - Determines whether a call uses struct return
3332/// semantics.
3333enum StructReturnType {
3334 NotStructReturn,
3335 RegStructReturn,
3336 StackStructReturn
3337};
3338static StructReturnType
3339callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3340 if (Outs.empty())
3341 return NotStructReturn;
3342
3343 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3344 if (!Flags.isSRet())
3345 return NotStructReturn;
3346 if (Flags.isInReg() || IsMCU)
3347 return RegStructReturn;
3348 return StackStructReturn;
3349}
3350
3351/// Determines whether a function uses struct return semantics.
3352static StructReturnType
3353argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3354 if (Ins.empty())
3355 return NotStructReturn;
3356
3357 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3358 if (!Flags.isSRet())
3359 return NotStructReturn;
3360 if (Flags.isInReg() || IsMCU)
3361 return RegStructReturn;
3362 return StackStructReturn;
3363}
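
Both classifiers apply the same three-way rule to the flags of the first argument. A compact standalone restatement (the flattened FirstArgFlags struct is hypothetical):

#include <cstdio>

// Hypothetical, flattened view of the sret classification used by both
// callIsStructReturn and argsAreStructReturn above.
enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn };

struct FirstArgFlags {
  bool IsSRet;  // first argument carries the sret attribute
  bool IsInReg; // ... and is additionally marked inreg
};

static StructReturnType classify(bool HasArgs, FirstArgFlags Flags, bool IsMCU) {
  if (!HasArgs || !Flags.IsSRet)
    return NotStructReturn;  // no args, or first arg is not sret
  if (Flags.IsInReg || IsMCU)
    return RegStructReturn;  // sret pointer travels in a register
  return StackStructReturn;  // sret pointer travels on the stack
}

int main() {
  std::printf("%d\n", classify(true, {true, false}, /*IsMCU=*/false)); // 2 = stack
  return 0;
}
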
3364
3365/// Make a copy of an aggregate at address specified by "Src" to address
3366/// "Dst" with size and alignment information specified by the specific
3367/// parameter attribute. The copy will be passed as a byval function parameter.
3368static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3369 SDValue Chain, ISD::ArgFlagsTy Flags,
3370 SelectionDAG &DAG, const SDLoc &dl) {
3371 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3372
3373 return DAG.getMemcpy(
3374 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3375 /*isVolatile*/ false, /*AlwaysInline=*/true,
3376 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3377}
3378
3379/// Return true if the calling convention is one that we can guarantee TCO for.
3380static bool canGuaranteeTCO(CallingConv::ID CC) {
3381 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3382 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3383 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3384 CC == CallingConv::SwiftTail);
3385}
3386
3387/// Return true if we might ever do TCO for calls with this calling convention.
3388static bool mayTailCallThisCC(CallingConv::ID CC) {
3389 switch (CC) {
3390 // C calling conventions:
3391 case CallingConv::C:
3392 case CallingConv::Win64:
3393 case CallingConv::X86_64_SysV:
3394 // Callee pop conventions:
3395 case CallingConv::X86_ThisCall:
3396 case CallingConv::X86_StdCall:
3397 case CallingConv::X86_VectorCall:
3398 case CallingConv::X86_FastCall:
3399 // Swift:
3400 case CallingConv::Swift:
3401 return true;
3402 default:
3403 return canGuaranteeTCO(CC);
3404 }
3405}
3406
3407/// Return true if the function is being made into a tailcall target by
3408/// changing its ABI.
3409static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3410 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3411 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3412}
3413
3414bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3415 if (!CI->isTailCall())
3416 return false;
3417
3418 CallingConv::ID CalleeCC = CI->getCallingConv();
3419 if (!mayTailCallThisCC(CalleeCC))
3420 return false;
3421
3422 return true;
3423}
3424
3425SDValue
3426X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3427 const SmallVectorImpl<ISD::InputArg> &Ins,
3428 const SDLoc &dl, SelectionDAG &DAG,
3429 const CCValAssign &VA,
3430 MachineFrameInfo &MFI, unsigned i) const {
3431 // Create the nodes corresponding to a load from this parameter slot.
3432 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3433 bool AlwaysUseMutable = shouldGuaranteeTCO(
3434 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3435 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3436 EVT ValVT;
3437 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3438
3439 // If the value is passed by pointer, we have the address passed instead of the value
3440 // itself. No need to extend if the mask value and location share the same
3441 // absolute size.
3442 bool ExtendedInMem =
3443 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3444 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3445
3446 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3447 ValVT = VA.getLocVT();
3448 else
3449 ValVT = VA.getValVT();
3450
3451 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3452 // changed with more analysis.
3453 // In case of tail call optimization, mark all arguments mutable, since they
3454 // could be overwritten by the lowering of arguments in case of a tail call.
3455 if (Flags.isByVal()) {
3456 unsigned Bytes = Flags.getByValSize();
3457 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3458
3459 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3460 // can be improved with deeper analysis.
3461 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3462 /*isAliased=*/true);
3463 return DAG.getFrameIndex(FI, PtrVT);
3464 }
3465
3466 EVT ArgVT = Ins[i].ArgVT;
3467
3468 // If this is a vector that has been split into multiple parts, and the
3469 // scalar size of the parts doesn't match the vector element size, then we can't
3470 // elide the copy. The parts will have padding between them instead of being
3471 // packed like a vector.
3472 bool ScalarizedAndExtendedVector =
3473 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3474 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3475
3476 // This is an argument in memory. We might be able to perform copy elision.
3477 // If the argument is passed directly in memory without any extension, then we
3478 // can perform copy elision. Large vector types, for example, may be passed
3479 // indirectly by pointer.
3480 if (Flags.isCopyElisionCandidate() &&
3481 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3482 !ScalarizedAndExtendedVector) {
3483 SDValue PartAddr;
3484 if (Ins[i].PartOffset == 0) {
3485 // If this is a one-part value or the first part of a multi-part value,
3486 // create a stack object for the entire argument value type and return a
3487 // load from our portion of it. This assumes that if the first part of an
3488 // argument is in memory, the rest will also be in memory.
3489 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3490 /*IsImmutable=*/false);
3491 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3492 return DAG.getLoad(
3493 ValVT, dl, Chain, PartAddr,
3494 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3495 } else {
3496 // This is not the first piece of an argument in memory. See if there is
3497 // already a fixed stack object including this offset. If so, assume it
3498 // was created by the PartOffset == 0 branch above and create a load from
3499 // the appropriate offset into it.
3500 int64_t PartBegin = VA.getLocMemOffset();
3501 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3502 int FI = MFI.getObjectIndexBegin();
3503 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3504 int64_t ObjBegin = MFI.getObjectOffset(FI);
3505 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3506 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3507 break;
3508 }
3509 if (MFI.isFixedObjectIndex(FI)) {
3510 SDValue Addr =
3511 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3512 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3513 return DAG.getLoad(
3514 ValVT, dl, Chain, Addr,
3515 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3516 Ins[i].PartOffset));
3517 }
3518 }
3519 }
3520
3521 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3522 VA.getLocMemOffset(), isImmutable);
3523
3524 // Set SExt or ZExt flag.
3525 if (VA.getLocInfo() == CCValAssign::ZExt) {
3526 MFI.setObjectZExt(FI, true);
3527 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3528 MFI.setObjectSExt(FI, true);
3529 }
3530
3531 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3532 SDValue Val = DAG.getLoad(
3533 ValVT, dl, Chain, FIN,
3534 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3535 return ExtendedInMem
3536 ? (VA.getValVT().isVector()
3537 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3538 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3539 : Val;
3540}
3541
3542// FIXME: Get this from tablegen.
3543static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3544 const X86Subtarget &Subtarget) {
3545 assert(Subtarget.is64Bit());
3546
3547 if (Subtarget.isCallingConvWin64(CallConv)) {
3548 static const MCPhysReg GPR64ArgRegsWin64[] = {
3549 X86::RCX, X86::RDX, X86::R8, X86::R9
3550 };
3551 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3552 }
3553
3554 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3555 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3556 };
3557 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3558}
3559
3560// FIXME: Get this from tablegen.
3561static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3562 CallingConv::ID CallConv,
3563 const X86Subtarget &Subtarget) {
3564 assert(Subtarget.is64Bit());
3565 if (Subtarget.isCallingConvWin64(CallConv)) {
3566 // The XMM registers which might contain var arg parameters are shadowed
3567 // in their paired GPR. So we only need to save the GPR to their home
3568 // slots.
3569 // TODO: __vectorcall will change this.
3570 return None;
3571 }
3572
3573 bool isSoftFloat = Subtarget.useSoftFloat();
3574 if (isSoftFloat || !Subtarget.hasSSE1())
3575 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3576 // registers.
3577 return None;
3578
3579 static const MCPhysReg XMMArgRegs64Bit[] = {
3580 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3581 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3582 };
3583 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3584}
3585
3586#ifndef NDEBUG
3587static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3588 return llvm::is_sorted(
3589 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3590 return A.getValNo() < B.getValNo();
3591 });
3592}
3593#endif
3594
3595namespace {
3596/// This is a helper class for lowering variable argument parameters.
3597class VarArgsLoweringHelper {
3598public:
3599 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3600 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3601 CallingConv::ID CallConv, CCState &CCInfo)
3602 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3603 TheMachineFunction(DAG.getMachineFunction()),
3604 TheFunction(TheMachineFunction.getFunction()),
3605 FrameInfo(TheMachineFunction.getFrameInfo()),
3606 FrameLowering(*Subtarget.getFrameLowering()),
3607 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3608 CCInfo(CCInfo) {}
3609
3610 // Lower variable argument parameters.
3611 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3612
3613private:
3614 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3615
3616 void forwardMustTailParameters(SDValue &Chain);
3617
3618 bool is64Bit() const { return Subtarget.is64Bit(); }
3619 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3620
3621 X86MachineFunctionInfo *FuncInfo;
3622 const SDLoc &DL;
3623 SelectionDAG &DAG;
3624 const X86Subtarget &Subtarget;
3625 MachineFunction &TheMachineFunction;
3626 const Function &TheFunction;
3627 MachineFrameInfo &FrameInfo;
3628 const TargetFrameLowering &FrameLowering;
3629 const TargetLowering &TargLowering;
3630 CallingConv::ID CallConv;
3631 CCState &CCInfo;
3632};
3633} // namespace
3634
3635void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3636 SDValue &Chain, unsigned StackSize) {
3637   // If the function takes a variable number of arguments, make a frame index
3638   // for the start of the first vararg value, for expansion of llvm.va_start.
3639   // We can skip this if there are no va_start calls.
3640 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3641 CallConv != CallingConv::X86_ThisCall)) {
3642 FuncInfo->setVarArgsFrameIndex(
3643 FrameInfo.CreateFixedObject(1, StackSize, true));
3644 }
3645
3646 // 64-bit calling conventions support varargs and register parameters, so we
3647 // have to do extra work to spill them in the prologue.
3648 if (is64Bit()) {
3649 // Find the first unallocated argument registers.
3650 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3651 ArrayRef<MCPhysReg> ArgXMMs =
3652 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3653 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3654 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3655
3656    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3657           "SSE register cannot be used when SSE is disabled!");
3658
3659 if (isWin64()) {
3660 // Get to the caller-allocated home save location. Add 8 to account
3661 // for the return address.
3662 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3663 FuncInfo->setRegSaveFrameIndex(
3664 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3665 // Fixup to set vararg frame on shadow area (4 x i64).
3666 if (NumIntRegs < 4)
3667 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3668 } else {
3669      // For X86-64, if there are vararg parameters that are passed via
3670      // registers, then we must store them to their spots on the stack so
3671      // they may be loaded by dereferencing the result of va_arg.
3672 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3673 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3674 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3675 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3676 }
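    // Illustrative aside (not part of X86ISelLowering.cpp): on Win64 the
    // caller always reserves a 32-byte "home" area directly above the return
    // address, so the argument GPRs are saved into slots the caller already
    // allocated rather than into a fresh register save area:
    //
    //   [RSP at entry] -> return address      (8 bytes)
    //                     home slot for RCX   (RSP +  8)
    //                     home slot for RDX   (RSP + 16)
    //                     home slot for R8    (RSP + 24)
    //                     home slot for R9    (RSP + 32)
    //
    // This is why the Win64 branch above only creates a fixed object at
    // NumIntRegs * 8 + HomeOffset.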
3677
3678 SmallVector<SDValue, 6>
3679        LiveGPRs; // SDValues for the GPRs holding live incoming argument values
3680    SmallVector<SDValue, 8> LiveXMMRegs; // SDValues for the XMM registers
3681                                         // holding live incoming argument values
3682    SDValue ALVal; // if applicable, holds the SDValue for the %al register
3683
3684 // Gather all the live in physical registers.
3685 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3686 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3687 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3688 }
3689 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3690 if (!AvailableXmms.empty()) {
3691 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3692 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3693 for (MCPhysReg Reg : AvailableXmms) {
3694        // FastRegisterAllocator spills virtual registers at basic
3695        // block boundaries. That leads to uses of XMM registers
3696        // outside of the check for %al. Pass physical registers to
3697        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
3698 TheMachineFunction.getRegInfo().addLiveIn(Reg);
3699 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
3700 }
3701 }
3702
3703 // Store the integer parameter registers.
3704 SmallVector<SDValue, 8> MemOps;
3705 SDValue RSFIN =
3706 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3707 TargLowering.getPointerTy(DAG.getDataLayout()));
3708 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3709 for (SDValue Val : LiveGPRs) {
3710 SDValue FIN = DAG.getNode(ISD::ADD, DL,
3711 TargLowering.getPointerTy(DAG.getDataLayout()),
3712 RSFIN, DAG.getIntPtrConstant(Offset, DL));
3713 SDValue Store =
3714 DAG.getStore(Val.getValue(1), DL, Val, FIN,
3715 MachinePointerInfo::getFixedStack(
3716 DAG.getMachineFunction(),
3717 FuncInfo->getRegSaveFrameIndex(), Offset));
3718 MemOps.push_back(Store);
3719 Offset += 8;
3720 }
3721
3722 // Now store the XMM (fp + vector) parameter registers.
3723 if (!LiveXMMRegs.empty()) {
3724 SmallVector<SDValue, 12> SaveXMMOps;
3725 SaveXMMOps.push_back(Chain);
3726 SaveXMMOps.push_back(ALVal);
3727 SaveXMMOps.push_back(
3728 DAG.getTargetConstant(FuncInfo->getRegSaveFrameIndex(), DL, MVT::i32));
3729 SaveXMMOps.push_back(
3730 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
3731 llvm::append_range(SaveXMMOps, LiveXMMRegs);
3732 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
3733 MVT::Other, SaveXMMOps));
3734 }
3735
3736 if (!MemOps.empty())
3737 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3738 }
3739}
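// Illustrative aside (not part of X86ISelLowering.cpp): the register save
// area built above is what a SysV x86-64 va_list walks at run time. Assuming
// the psABI field names, the layout it serves looks like:
//
//   struct VaListTag {
//     unsigned gp_offset;         // 0..48 in steps of 8  (RDI,RSI,RDX,RCX,R8,R9)
//     unsigned fp_offset;         // 48..176 in steps of 16 (XMM0..XMM7)
//     void    *overflow_arg_area; // memory-class (stack) arguments
//     void    *reg_save_area;     // the frame object created above
//                                 //   (6 * 8 + 8 * 16 = 176 bytes, Align(16))
//   };
//
// VarArgsGPOffset/VarArgsFPOffset set above are the initial gp_offset and
// fp_offset values that va_start writes into this structure.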
3740
3741void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
3742 // Find the largest legal vector type.
3743 MVT VecVT = MVT::Other;
3744 // FIXME: Only some x86_32 calling conventions support AVX512.
3745 if (Subtarget.useAVX512Regs() &&
3746 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
3747 CallConv == CallingConv::Intel_OCL_BI)))
3748 VecVT = MVT::v16f32;
3749 else if (Subtarget.hasAVX())
3750 VecVT = MVT::v8f32;
3751 else if (Subtarget.hasSSE2())
3752 VecVT = MVT::v4f32;
3753
3754 // We forward some GPRs and some vector types.
3755 SmallVector<MVT, 2> RegParmTypes;
3756 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
3757 RegParmTypes.push_back(IntVT);
3758 if (VecVT != MVT::Other)
3759 RegParmTypes.push_back(VecVT);
3760
3761 // Compute the set of forwarded registers. The rest are scratch.
3762 SmallVectorImpl<ForwardedRegister> &Forwards =
3763 FuncInfo->getForwardedMustTailRegParms();
3764 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3765
3766 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3767 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
3768 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3769 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3770 }
3771
3772 // Copy all forwards from physical to virtual registers.
3773 for (ForwardedRegister &FR : Forwards) {
3774 // FIXME: Can we use a less constrained schedule?
3775 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
3776 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
3777 TargLowering.getRegClassFor(FR.VT));
3778 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
3779 }
3780}
3781
3782void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
3783 unsigned StackSize) {
3784  // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
3785  // If necessary, it will be set to the correct value later.
3786 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3787 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3788
3789 if (FrameInfo.hasVAStart())
3790 createVarArgAreaAndStoreRegisters(Chain, StackSize);
3791
3792 if (FrameInfo.hasMustTailInVarArgFunc())
3793 forwardMustTailParameters(Chain);
3794}
3795
3796SDValue X86TargetLowering::LowerFormalArguments(
3797 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
3798 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3799 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3800 MachineFunction &MF = DAG.getMachineFunction();
3801 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3802
3803 const Function &F = MF.getFunction();
3804 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3805 F.getName() == "main")
3806 FuncInfo->setForceFramePointer(true);
3807
3808 MachineFrameInfo &MFI = MF.getFrameInfo();
3809 bool Is64Bit = Subtarget.is64Bit();
3810 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3811
3812  assert(
3813      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
3814      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3815
3816 // Assign locations to all of the incoming arguments.
3817 SmallVector<CCValAssign, 16> ArgLocs;
3818 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3819
3820 // Allocate shadow area for Win64.
3821 if (IsWin64)
3822 CCInfo.AllocateStack(32, Align(8));
3823
3824 CCInfo.AnalyzeArguments(Ins, CC_X86);
3825
3826 // In vectorcall calling convention a second pass is required for the HVA
3827 // types.
3828 if (CallingConv::X86_VectorCall == CallConv) {
3829 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3830 }
3831
3832 // The next loop assumes that the locations are in the same order of the
3833 // input arguments.
3834  assert(isSortedByValueNo(ArgLocs) &&
3835         "Argument Location list must be sorted before lowering");
3836
3837 SDValue ArgValue;
3838 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3839 ++I, ++InsIndex) {
3840    assert(InsIndex < Ins.size() && "Invalid Ins index");
3841 CCValAssign &VA = ArgLocs[I];
3842
3843 if (VA.isRegLoc()) {
3844 EVT RegVT = VA.getLocVT();
3845 if (VA.needsCustom()) {
3846        assert(
3847            VA.getValVT() == MVT::v64i1 &&
3848            "Currently the only custom case is when we split v64i1 to 2 regs");
3849
3850        // In the regcall calling convention, v64i1 values that are
3851        // compiled for a 32-bit target are split up into two registers.
3852 ArgValue =
3853 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3854 } else {
3855 const TargetRegisterClass *RC;
3856 if (RegVT == MVT::i8)
3857 RC = &X86::GR8RegClass;
3858 else if (RegVT == MVT::i16)
3859 RC = &X86::GR16RegClass;
3860 else if (RegVT == MVT::i32)
3861 RC = &X86::GR32RegClass;
3862 else if (Is64Bit && RegVT == MVT::i64)
3863 RC = &X86::GR64RegClass;
3864 else if (RegVT == MVT::f16)
3865 RC = &X86::FR16XRegClass;
3866 else if (RegVT == MVT::f32)
3867 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3868 else if (RegVT == MVT::f64)
3869 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3870 else if (RegVT == MVT::f80)
3871 RC = &X86::RFP80RegClass;
3872 else if (RegVT == MVT::f128)
3873 RC = &X86::VR128RegClass;
3874 else if (RegVT.is512BitVector())
3875 RC = &X86::VR512RegClass;
3876 else if (RegVT.is256BitVector())
3877 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3878 else if (RegVT.is128BitVector())
3879 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3880 else if (RegVT == MVT::x86mmx)
3881 RC = &X86::VR64RegClass;
3882 else if (RegVT == MVT::v1i1)
3883 RC = &X86::VK1RegClass;
3884 else if (RegVT == MVT::v8i1)
3885 RC = &X86::VK8RegClass;
3886 else if (RegVT == MVT::v16i1)
3887 RC = &X86::VK16RegClass;
3888 else if (RegVT == MVT::v32i1)
3889 RC = &X86::VK32RegClass;
3890 else if (RegVT == MVT::v64i1)
3891 RC = &X86::VK64RegClass;
3892 else
3893 llvm_unreachable("Unknown argument type!")::llvm::llvm_unreachable_internal("Unknown argument type!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3893)
;
3894
3895 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3896 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3897 }
3898
3899 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3900 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3901 // right size.
3902 if (VA.getLocInfo() == CCValAssign::SExt)
3903 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3904 DAG.getValueType(VA.getValVT()));
3905 else if (VA.getLocInfo() == CCValAssign::ZExt)
3906 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3907 DAG.getValueType(VA.getValVT()));
3908 else if (VA.getLocInfo() == CCValAssign::BCvt)
3909 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3910
3911 if (VA.isExtInLoc()) {
3912 // Handle MMX values passed in XMM regs.
3913 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3914 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3915 else if (VA.getValVT().isVector() &&
3916 VA.getValVT().getScalarType() == MVT::i1 &&
3917 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3918 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3919 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3920 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3921 } else
3922 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3923 }
3924 } else {
3925      assert(VA.isMemLoc());
3926 ArgValue =
3927 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3928 }
3929
3930 // If value is passed via pointer - do a load.
3931 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3932 ArgValue =
3933 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3934
3935 InVals.push_back(ArgValue);
3936 }
3937
3938 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3939 if (Ins[I].Flags.isSwiftAsync()) {
3940 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
3941 if (Subtarget.is64Bit())
3942 X86FI->setHasSwiftAsyncContext(true);
3943 else {
3944 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
3945 X86FI->setSwiftAsyncContextFrameIdx(FI);
3946 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
3947 DAG.getFrameIndex(FI, MVT::i32),
3948 MachinePointerInfo::getFixedStack(MF, FI));
3949 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
3950 }
3951 }
3952
3953 // Swift calling convention does not require we copy the sret argument
3954 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3955 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
3956 continue;
3957
3958 // All x86 ABIs require that for returning structs by value we copy the
3959 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3960 // the argument into a virtual register so that we can access it from the
3961 // return points.
3962 if (Ins[I].Flags.isSRet()) {
3963      assert(!FuncInfo->getSRetReturnReg() &&
3964             "SRet return has already been set");
3965 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3966 Register Reg =
3967 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3968 FuncInfo->setSRetReturnReg(Reg);
3969 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3970 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3971 break;
3972 }
3973 }
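  // Illustrative aside (not part of X86ISelLowering.cpp): the sret handling
  // above is what makes the following well-defined for all x86 ABIs except
  // Swift. Assuming a C-level example:
  //
  //   struct Big { long a, b, c; };
  //   struct Big make_big(void);   // lowered with a hidden sret pointer
  //
  // the caller passes the hidden Big* (in %rdi on x86-64, on the stack for
  // i386), and the virtual register saved in SRetReturnReg lets the return
  // sequence copy that pointer back into %rax/%eax as the ABIs require.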
3974
3975 unsigned StackSize = CCInfo.getNextStackOffset();
3976 // Align stack specially for tail calls.
3977 if (shouldGuaranteeTCO(CallConv,
3978 MF.getTarget().Options.GuaranteedTailCallOpt))
3979 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3980
3981 if (IsVarArg)
3982 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
3983 .lowerVarArgsParameters(Chain, StackSize);
3984
3985 // Some CCs need callee pop.
3986 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
3987 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3988 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3989 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3990 // X86 interrupts must pop the error code (and the alignment padding) if
3991 // present.
3992 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3993 } else {
3994 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3995 // If this is an sret function, the return should pop the hidden pointer.
3996 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3997 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3998 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3999 FuncInfo->setBytesToPopOnReturn(4);
4000 }
4001
4002 if (!Is64Bit) {
4003 // RegSaveFrameIndex is X86-64 only.
4004 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4005 }
4006
4007 FuncInfo->setArgumentStackSize(StackSize);
4008
4009 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4010 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4011 if (Personality == EHPersonality::CoreCLR) {
4012      assert(Is64Bit);
4013 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4014 // that we'd prefer this slot be allocated towards the bottom of the frame
4015 // (i.e. near the stack pointer after allocating the frame). Every
4016 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4017 // offset from the bottom of this and each funclet's frame must be the
4018 // same, so the size of funclets' (mostly empty) frames is dictated by
4019 // how far this slot is from the bottom (since they allocate just enough
4020 // space to accommodate holding this slot at the correct offset).
4021 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4022 EHInfo->PSPSymFrameIdx = PSPSymFI;
4023 }
4024 }
4025
4026 if (CallConv == CallingConv::X86_RegCall ||
4027 F.hasFnAttribute("no_caller_saved_registers")) {
4028 MachineRegisterInfo &MRI = MF.getRegInfo();
4029 for (std::pair<Register, Register> Pair : MRI.liveins())
4030 MRI.disableCalleeSavedRegister(Pair.first);
4031 }
4032
4033 return Chain;
4034}
4035
4036SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4037 SDValue Arg, const SDLoc &dl,
4038 SelectionDAG &DAG,
4039 const CCValAssign &VA,
4040 ISD::ArgFlagsTy Flags,
4041 bool isByVal) const {
4042 unsigned LocMemOffset = VA.getLocMemOffset();
4043 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4044 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4045 StackPtr, PtrOff);
4046 if (isByVal)
4047 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4048
4049 return DAG.getStore(
4050 Chain, dl, Arg, PtrOff,
4051 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
4052}
4053
4054 /// Emit a load of the return address if tail call
4055 /// optimization is performed and it is required.
4056SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4057 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4058 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4059 // Adjust the Return address stack slot.
4060 EVT VT = getPointerTy(DAG.getDataLayout());
4061 OutRetAddr = getReturnAddressFrameIndex(DAG);
4062
4063 // Load the "old" Return address.
4064 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4065 return SDValue(OutRetAddr.getNode(), 1);
4066}
4067
4068/// Emit a store of the return address if tail call
4069/// optimization is performed and it is required (FPDiff!=0).
4070static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4071 SDValue Chain, SDValue RetAddrFrIdx,
4072 EVT PtrVT, unsigned SlotSize,
4073 int FPDiff, const SDLoc &dl) {
4074 // Store the return address to the appropriate stack slot.
4075 if (!FPDiff) return Chain;
4076 // Calculate the new stack slot for the return address.
4077 int NewReturnAddrFI =
4078 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4079 false);
4080 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4081 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4082 MachinePointerInfo::getFixedStack(
4083 DAG.getMachineFunction(), NewReturnAddrFI));
4084 return Chain;
4085}
4086
4087 /// Returns a vector_shuffle mask for a movs{s|d} or movd
4088 /// operation of the specified width.
4089static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4090 SDValue V2) {
4091 unsigned NumElems = VT.getVectorNumElements();
4092 SmallVector<int, 8> Mask;
4093 Mask.push_back(NumElems);
4094 for (unsigned i = 1; i != NumElems; ++i)
4095 Mask.push_back(i);
4096 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4097}
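// Example (illustrative, not from the original source): for VT = MVT::v4f32,
// getMOVL builds the shuffle mask {4, 1, 2, 3}. Indices >= NumElems select
// from V2, so lane 0 is taken from V2 and lanes 1-3 from V1 -- exactly the
// movss merge pattern; for v2i64/v2f64 the mask is {2, 1}, matching movsd.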
4098
4099SDValue
4100X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4101 SmallVectorImpl<SDValue> &InVals) const {
4102 SelectionDAG &DAG = CLI.DAG;
4103 SDLoc &dl = CLI.DL;
4104 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4105 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4106 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4107 SDValue Chain = CLI.Chain;
4108 SDValue Callee = CLI.Callee;
4109 CallingConv::ID CallConv = CLI.CallConv;
4110 bool &isTailCall = CLI.IsTailCall;
4111 bool isVarArg = CLI.IsVarArg;
4112 const auto *CB = CLI.CB;
4113
4114 MachineFunction &MF = DAG.getMachineFunction();
4115 bool Is64Bit = Subtarget.is64Bit();
4116 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4117 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
4118 bool IsSibcall = false;
4119 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4120 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4121 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4122 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4123 CB->hasFnAttr("no_caller_saved_registers"));
4124 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4125 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4126 const Module *M = MF.getMMI().getModule();
4127 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4128
4129 MachineFunction::CallSiteInfo CSInfo;
4130 if (CallConv == CallingConv::X86_INTR)
4131 report_fatal_error("X86 interrupts may not be called directly");
4132
4133 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4134 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4135 // If we are using a GOT, disable tail calls to external symbols with
4136 // default visibility. Tail calling such a symbol requires using a GOT
4137    // relocation, which forces early binding of the symbol. This breaks code
4138    // that requires lazy function symbol resolution. Using musttail or
4139 // GuaranteedTailCallOpt will override this.
4140 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4141 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4142 G->getGlobal()->hasDefaultVisibility()))
4143 isTailCall = false;
4144 }
4145
4146
4147 if (isTailCall && !IsMustTail) {
4148 // Check if it's really possible to do a tail call.
4149 isTailCall = IsEligibleForTailCallOptimization(
4150 Callee, CallConv, SR == StackStructReturn, isVarArg, CLI.RetTy, Outs,
4151 OutVals, Ins, DAG);
4152
4153 // Sibcalls are automatically detected tailcalls which do not require
4154 // ABI changes.
4155 if (!IsGuaranteeTCO && isTailCall)
4156 IsSibcall = true;
4157
4158 if (isTailCall)
4159 ++NumTailCalls;
4160 }
4161
4162 if (IsMustTail && !isTailCall)
4163 report_fatal_error("failed to perform tail call elimination on a call "
4164 "site marked musttail");
4165
4166  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4167         "Var args not supported with calling convention fastcc, ghc or hipe");
4168
4169 // Analyze operands of the call, assigning locations to each operand.
4170 SmallVector<CCValAssign, 16> ArgLocs;
4171 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4172
4173 // Allocate shadow area for Win64.
4174 if (IsWin64)
4175 CCInfo.AllocateStack(32, Align(8));
4176
4177 CCInfo.AnalyzeArguments(Outs, CC_X86);
4178
4179 // In vectorcall calling convention a second pass is required for the HVA
4180 // types.
4181 if (CallingConv::X86_VectorCall == CallConv) {
4182 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4183 }
4184
4185 // Get a count of how many bytes are to be pushed on the stack.
4186 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4187 if (IsSibcall)
4188    // This is a sibcall. The memory operands are already available in the
4189    // caller's own caller's stack.
4190 NumBytes = 0;
4191 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4192 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4193
4194 int FPDiff = 0;
4195 if (isTailCall &&
4196 shouldGuaranteeTCO(CallConv,
4197 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4198 // Lower arguments at fp - stackoffset + fpdiff.
4199 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4200
4201 FPDiff = NumBytesCallerPushed - NumBytes;
4202
4203    // Set the delta of movement of the return address stack slot,
4204    // but only if the delta is greater than the previous delta.
4205 if (FPDiff < X86Info->getTCReturnAddrDelta())
4206 X86Info->setTCReturnAddrDelta(FPDiff);
4207 }
4208
4209 unsigned NumBytesToPush = NumBytes;
4210 unsigned NumBytesToPop = NumBytes;
4211
4212  // If we have an inalloca argument, all stack space has already been allocated
4213  // for us and is right at the top of the stack. We don't support multiple
4214  // arguments passed in memory when using inalloca.
4215 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4216 NumBytesToPush = 0;
4217 if (!ArgLocs.back().isMemLoc())
4218 report_fatal_error("cannot use inalloca attribute on a register "
4219 "parameter");
4220 if (ArgLocs.back().getLocMemOffset() != 0)
4221 report_fatal_error("any parameter with the inalloca attribute must be "
4222 "the only memory argument");
4223 } else if (CLI.IsPreallocated) {
4224    assert(ArgLocs.back().isMemLoc() &&
4225           "cannot use preallocated attribute on a register "
4226           "parameter");
4227 SmallVector<size_t, 4> PreallocatedOffsets;
4228 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4229 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4230 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4231 }
4232 }
4233 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4234 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4235 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4236 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4237 NumBytesToPush = 0;
4238 }
4239
4240 if (!IsSibcall && !IsMustTail)
4241 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4242 NumBytes - NumBytesToPush, dl);
4243
4244 SDValue RetAddrFrIdx;
4245 // Load return address for tail calls.
4246 if (isTailCall && FPDiff)
4247 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4248 Is64Bit, FPDiff, dl);
4249
4250 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4251 SmallVector<SDValue, 8> MemOpChains;
4252 SDValue StackPtr;
4253
4254 // The next loop assumes that the locations are in the same order of the
4255 // input arguments.
4256  assert(isSortedByValueNo(ArgLocs) &&
4257         "Argument Location list must be sorted before lowering");
4258
4259  // Walk the register/memloc assignments, inserting copies/loads. In the case
4260  // of tail call optimization, arguments are handled later.
4261 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4262 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4263 ++I, ++OutIndex) {
4264    assert(OutIndex < Outs.size() && "Invalid Out index");
4265 // Skip inalloca/preallocated arguments, they have already been written.
4266 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4267 if (Flags.isInAlloca() || Flags.isPreallocated())
4268 continue;
4269
4270 CCValAssign &VA = ArgLocs[I];
4271 EVT RegVT = VA.getLocVT();
4272 SDValue Arg = OutVals[OutIndex];
4273 bool isByVal = Flags.isByVal();
4274
4275 // Promote the value if needed.
4276 switch (VA.getLocInfo()) {
4277    default: llvm_unreachable("Unknown loc info!");
4278 case CCValAssign::Full: break;
4279 case CCValAssign::SExt:
4280 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4281 break;
4282 case CCValAssign::ZExt:
4283 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4284 break;
4285 case CCValAssign::AExt:
4286 if (Arg.getValueType().isVector() &&
4287 Arg.getValueType().getVectorElementType() == MVT::i1)
4288 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4289 else if (RegVT.is128BitVector()) {
4290 // Special case: passing MMX values in XMM registers.
4291 Arg = DAG.getBitcast(MVT::i64, Arg);
4292 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4293 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4294 } else
4295 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4296 break;
4297 case CCValAssign::BCvt:
4298 Arg = DAG.getBitcast(RegVT, Arg);
4299 break;
4300 case CCValAssign::Indirect: {
4301 if (isByVal) {
4302 // Memcpy the argument to a temporary stack slot to prevent
4303 // the caller from seeing any modifications the callee may make
4304 // as guaranteed by the `byval` attribute.
4305 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4306 Flags.getByValSize(),
4307 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4308 SDValue StackSlot =
4309 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4310 Chain =
4311 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4312 // From now on treat this as a regular pointer
4313 Arg = StackSlot;
4314 isByVal = false;
4315 } else {
4316 // Store the argument.
4317 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4318 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4319 Chain = DAG.getStore(
4320 Chain, dl, Arg, SpillSlot,
4321 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4322 Arg = SpillSlot;
4323 }
4324 break;
4325 }
4326 }
4327
4328 if (VA.needsCustom()) {
4329      assert(VA.getValVT() == MVT::v64i1 &&
4330             "Currently the only custom case is when we split v64i1 to 2 regs");
4331 // Split v64i1 value into two registers
4332 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4333 } else if (VA.isRegLoc()) {
4334 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4335 const TargetOptions &Options = DAG.getTarget().Options;
4336 if (Options.EmitCallSiteInfo)
4337 CSInfo.emplace_back(VA.getLocReg(), I);
4338 if (isVarArg && IsWin64) {
4339        // The Win64 ABI requires an argument XMM reg to be copied to the
4340        // corresponding shadow GPR if the callee is a varargs function.
4341 Register ShadowReg;
4342 switch (VA.getLocReg()) {
4343 case X86::XMM0: ShadowReg = X86::RCX; break;
4344 case X86::XMM1: ShadowReg = X86::RDX; break;
4345 case X86::XMM2: ShadowReg = X86::R8; break;
4346 case X86::XMM3: ShadowReg = X86::R9; break;
4347 }
4348 if (ShadowReg)
4349 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4350 }
4351 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4352      assert(VA.isMemLoc());
4353 if (!StackPtr.getNode())
4354 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4355 getPointerTy(DAG.getDataLayout()));
4356 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4357 dl, DAG, VA, Flags, isByVal));
4358 }
4359 }
4360
4361 if (!MemOpChains.empty())
4362 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4363
4364 if (Subtarget.isPICStyleGOT()) {
4365    // ELF / PIC requires the GOT pointer to be in the EBX register before
4366    // function calls made via the PLT (except for regcall).
4367 if (!isTailCall) {
4368      // An indirect call with the RegCall calling convention may use up all
4369      // the general registers, so it is not suitable to bind EBX as the register
4370      // for the GOT address; just let the register allocator handle it.
4371 if (CallConv != CallingConv::X86_RegCall)
4372 RegsToPass.push_back(std::make_pair(
4373 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4374 getPointerTy(DAG.getDataLayout()))));
4375 } else {
4376 // If we are tail calling and generating PIC/GOT style code load the
4377 // address of the callee into ECX. The value in ecx is used as target of
4378 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4379 // for tail calls on PIC/GOT architectures. Normally we would just put the
4380 // address of GOT into ebx and then call target@PLT. But for tail calls
4381 // ebx would be restored (since ebx is callee saved) before jumping to the
4382 // target@PLT.
4383
4384 // Note: The actual moving to ECX is done further down.
4385 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4386 if (G && !G->getGlobal()->hasLocalLinkage() &&
4387 G->getGlobal()->hasDefaultVisibility())
4388 Callee = LowerGlobalAddress(Callee, DAG);
4389 else if (isa<ExternalSymbolSDNode>(Callee))
4390 Callee = LowerExternalSymbol(Callee, DAG);
4391 }
4392 }
4393
4394 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4395 // From AMD64 ABI document:
4396 // For calls that may call functions that use varargs or stdargs
4397 // (prototype-less calls or calls to functions containing ellipsis (...) in
4398    // the declaration) %al is used as a hidden argument to specify the number
4399    // of SSE registers used. The contents of %al do not need to match exactly
4400    // the number of registers, but must be an upper bound on the number of SSE
4401    // registers used and is in the range 0 - 8 inclusive.
4402
4403 // Count the number of XMM registers allocated.
4404 static const MCPhysReg XMMArgRegs[] = {
4405 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4406 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4407 };
4408 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4409    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4410           && "SSE registers cannot be used when SSE is disabled");
4411 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4412 DAG.getConstant(NumXMMRegs, dl,
4413 MVT::i8)));
4414 }
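  // Illustrative aside (not part of X86ISelLowering.cpp): for a variadic SysV
  // x86-64 call such as printf("%f %f\n", 1.0, 2.0), the lowering above ends
  // up emitting the equivalent of
  //
  //   movsd  .LC0(%rip), %xmm0
  //   movsd  .LC1(%rip), %xmm1
  //   movb   $2, %al          # upper bound on the SSE registers used
  //   call   printf@PLT
  //
  // %al may over-estimate (any value up to 8 is valid) but must never
  // under-estimate the number of vector registers carrying arguments.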
4415
4416 if (isVarArg && IsMustTail) {
4417 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4418 for (const auto &F : Forwards) {
4419 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4420 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4421 }
4422 }
4423
4424 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4425 // don't need this because the eligibility check rejects calls that require
4426 // shuffling arguments passed in memory.
4427 if (!IsSibcall && isTailCall) {
4428 // Force all the incoming stack arguments to be loaded from the stack
4429 // before any new outgoing arguments are stored to the stack, because the
4430 // outgoing stack slots may alias the incoming argument stack slots, and
4431 // the alias isn't otherwise explicit. This is slightly more conservative
4432 // than necessary, because it means that each store effectively depends
4433 // on every argument instead of just those arguments it would clobber.
4434 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4435
4436 SmallVector<SDValue, 8> MemOpChains2;
4437 SDValue FIN;
4438 int FI = 0;
4439 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4440 ++I, ++OutsIndex) {
4441 CCValAssign &VA = ArgLocs[I];
4442
4443 if (VA.isRegLoc()) {
4444 if (VA.needsCustom()) {
4445          assert((CallConv == CallingConv::X86_RegCall) &&
4446                 "Expecting custom case only in regcall calling convention");
4447          // This means that we are in the special case where one argument was
4448          // passed through two register locations, so skip the next location.
4449 ++I;
4450 }
4451
4452 continue;
4453 }
4454
4455      assert(VA.isMemLoc());
4456 SDValue Arg = OutVals[OutsIndex];
4457 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4458 // Skip inalloca/preallocated arguments. They don't require any work.
4459 if (Flags.isInAlloca() || Flags.isPreallocated())
4460 continue;
4461 // Create frame index.
4462 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4463 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4464 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4465 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4466
4467 if (Flags.isByVal()) {
4468 // Copy relative to framepointer.
4469 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4470 if (!StackPtr.getNode())
4471 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4472 getPointerTy(DAG.getDataLayout()));
4473 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4474 StackPtr, Source);
4475
4476 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4477 ArgChain,
4478 Flags, DAG, dl));
4479 } else {
4480 // Store relative to framepointer.
4481 MemOpChains2.push_back(DAG.getStore(
4482 ArgChain, dl, Arg, FIN,
4483 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4484 }
4485 }
4486
4487 if (!MemOpChains2.empty())
4488 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4489
4490 // Store the return address to the appropriate stack slot.
4491 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4492 getPointerTy(DAG.getDataLayout()),
4493 RegInfo->getSlotSize(), FPDiff, dl);
4494 }
4495
4496 // Build a sequence of copy-to-reg nodes chained together with token chain
4497 // and flag operands which copy the outgoing args into registers.
4498 SDValue InFlag;
4499 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4500 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4501 RegsToPass[i].second, InFlag);
4502 InFlag = Chain.getValue(1);
4503 }
4504
4505 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4506 assert(Is64Bit && "Large code model is only legal in 64-bit mode.")(static_cast <bool> (Is64Bit && "Large code model is only legal in 64-bit mode."
) ? void (0) : __assert_fail ("Is64Bit && \"Large code model is only legal in 64-bit mode.\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4506, __extension__ __PRETTY_FUNCTION__))
;
4507 // In the 64-bit large code model, we have to make all calls
4508 // through a register, since the call instruction's 32-bit
4509 // pc-relative offset may not be large enough to hold the whole
4510 // address.
4511 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4512 Callee->getOpcode() == ISD::ExternalSymbol) {
4513 // Lower direct calls to global addresses and external symbols. Setting
4514 // ForCall to true here has the effect of removing WrapperRIP when possible
4515 // to allow direct calls to be selected without first materializing the
4516 // address into a register.
4517 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4518 } else if (Subtarget.isTarget64BitILP32() &&
4519 Callee->getValueType(0) == MVT::i32) {
4520 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4521 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4522 }
4523
4524 // Returns a chain & a flag for retval copy to use.
4525 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4526 SmallVector<SDValue, 8> Ops;
4527
4528 if (!IsSibcall && isTailCall && !IsMustTail) {
4529 Chain = DAG.getCALLSEQ_END(Chain,
4530 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4531 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4532 InFlag = Chain.getValue(1);
4533 }
4534
4535 Ops.push_back(Chain);
4536 Ops.push_back(Callee);
4537
4538 if (isTailCall)
4539 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4540
4541 // Add argument registers to the end of the list so that they are known live
4542 // into the call.
4543 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4544 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4545 RegsToPass[i].second.getValueType()));
4546
4547 // Add a register mask operand representing the call-preserved registers.
4548 const uint32_t *Mask = [&]() {
4549 auto AdaptedCC = CallConv;
4550 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4551 // use X86_INTR calling convention because it has the same CSR mask
4552 // (same preserved registers).
4553 if (HasNCSR)
4554 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4555    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4556    // to use the CSR_NoRegs_RegMask.
4557 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4558 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4559 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4560 }();
4561 assert(Mask && "Missing call preserved mask for calling convention")(static_cast <bool> (Mask && "Missing call preserved mask for calling convention"
) ? void (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4561, __extension__ __PRETTY_FUNCTION__))
;
4562
4563 // If this is an invoke in a 32-bit function using a funclet-based
4564 // personality, assume the function clobbers all registers. If an exception
4565 // is thrown, the runtime will not restore CSRs.
4566 // FIXME: Model this more precisely so that we can register allocate across
4567 // the normal edge and spill and fill across the exceptional edge.
4568 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4569 const Function &CallerFn = MF.getFunction();
4570 EHPersonality Pers =
4571 CallerFn.hasPersonalityFn()
4572 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4573 : EHPersonality::Unknown;
4574 if (isFuncletEHPersonality(Pers))
4575 Mask = RegInfo->getNoPreservedMask();
4576 }
4577
4578 // Define a new register mask from the existing mask.
4579 uint32_t *RegMask = nullptr;
4580
4581 // In some calling conventions we need to remove the used physical registers
4582 // from the reg mask.
4583 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4584 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4585
4586 // Allocate a new Reg Mask and copy Mask.
4587 RegMask = MF.allocateRegMask();
4588 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4589 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4590
4591 // Make sure all sub registers of the argument registers are reset
4592 // in the RegMask.
4593 for (auto const &RegPair : RegsToPass)
4594 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4595 SubRegs.isValid(); ++SubRegs)
4596 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4597
4598 // Create the RegMask Operand according to our updated mask.
4599 Ops.push_back(DAG.getRegisterMask(RegMask));
4600 } else {
4601 // Create the RegMask Operand according to the static mask.
4602 Ops.push_back(DAG.getRegisterMask(Mask));
4603 }
4604
4605 if (InFlag.getNode())
4606 Ops.push_back(InFlag);
4607
4608 if (isTailCall) {
4609 // We used to do:
4610 //// If this is the first return lowered for this function, add the regs
4611 //// to the liveout set for the function.
4612 // This isn't right, although it's probably harmless on x86; liveouts
4613 // should be computed from returns not tail calls. Consider a void
4614 // function making a tail call to a function returning int.
4615 MF.getFrameInfo().setHasTailCall();
4616 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4617 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4618 return Ret;
4619 }
4620
4621 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4622 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4623 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4624 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4625 // expanded to the call, directly followed by a special marker sequence and
4626    // a call to an ObjC library function. Use the CALL_RVMARKER to do that.
4627    assert(!isTailCall &&
4628           "tail calls cannot be marked with clang.arc.attachedcall");
4629 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode")(static_cast <bool> (Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"
) ? void (0) : __assert_fail ("Is64Bit && \"clang.arc.attachedcall is only supported in 64bit mode\""
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4629, __extension__ __PRETTY_FUNCTION__))
;
4630
4631    // Add a target constant to select the ObjC runtime call just before the
4632    // call target. RuntimeCallType == 0 selects objc_retainAutoreleasedReturnValue
4633    // and RuntimeCallType == 1 selects objc_unsafeClaimAutoreleasedReturnValue
4634    // when expanding the pseudo.
4635 unsigned RuntimeCallType =
4636 objcarc::hasAttachedCallOpBundle(CLI.CB, true) ? 0 : 1;
4637 Ops.insert(Ops.begin() + 1,
4638 DAG.getTargetConstant(RuntimeCallType, dl, MVT::i32));
4639 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4640 } else {
4641 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4642 }
4643
4644 InFlag = Chain.getValue(1);
4645 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4646 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4647
4648 // Save heapallocsite metadata.
4649 if (CLI.CB)
4650 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4651 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4652
4653 // Create the CALLSEQ_END node.
4654 unsigned NumBytesForCalleeToPop;
4655 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4656 DAG.getTarget().Options.GuaranteedTailCallOpt))
4657 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4658 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4659 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4660 SR == StackStructReturn)
4661 // If this is a call to a struct-return function, the callee
4662 // pops the hidden struct pointer, so we have to push it back.
4663 // This is common for Darwin/X86, Linux & Mingw32 targets.
4664 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4665 NumBytesForCalleeToPop = 4;
4666 else
4667 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4668
4669 // Returns a flag for retval copy to use.
4670 if (!IsSibcall) {
4671 Chain = DAG.getCALLSEQ_END(Chain,
4672 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4673 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4674 true),
4675 InFlag, dl);
4676 InFlag = Chain.getValue(1);
4677 }
4678
4679 // Handle result values, copying them out of physregs into vregs that we
4680 // return.
4681 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4682 InVals, RegMask);
4683}
4684
4685//===----------------------------------------------------------------------===//
4686// Fast Calling Convention (tail call) implementation
4687//===----------------------------------------------------------------------===//
4688
4689//  Like the stdcall convention, the callee cleans up the arguments, except
4690//  that ECX is reserved for storing the address of the tail-called function.
4691//  Only 2 registers are free for argument passing (inreg). Tail call optimization is performed
4692// provided:
4693// * tailcallopt is enabled
4694// * caller/callee are fastcc
4695// On X86_64 architecture with GOT-style position independent code only local
4696// (within module) calls are supported at the moment.
4697//  To keep the stack aligned according to the platform ABI, the function
4698//  GetAlignedArgumentStackSize ensures that the argument delta is always a
4699//  multiple of the stack alignment. (Dynamic linkers such as Darwin's dyld need this.)
4700//  If a tail-called callee has more arguments than the caller, the caller needs
4701//  to make sure that there is room to move the RETADDR to. This is achieved by
4702//  reserving an area the size of the argument delta right after the original
4703//  RETADDR, but before the saved frame pointer or the spilled registers,
4704//  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
4705// stack layout:
4706// arg1
4707// arg2
4708// RETADDR
4709// [ new RETADDR
4710// move area ]
4711// (possible EBP)
4712// ESI
4713// EDI
4714// local1 ..
4715
4716/// Align the argument stack size, e.g. to 16n + 12 for a 4-byte slot size and
4717/// a 16-byte alignment requirement.
4718unsigned
4719X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4720 SelectionDAG &DAG) const {
4721 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
4722 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4723  assert(StackSize % SlotSize == 0 &&
4724         "StackSize must be a multiple of SlotSize");
4725 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4726}
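// Illustrative sketch (not part of the lowering itself): plugging typical
// values into the formula above, assuming the usual 16-byte stack alignment,
//   64-bit (SlotSize = 8): StackSize = 40 -> alignTo(48, 16) - 8 = 40
//                          StackSize = 48 -> alignTo(56, 16) - 8 = 56   (16n + 8)
//   32-bit (SlotSize = 4): StackSize = 20 -> alignTo(24, 16) - 4 = 28   (16n + 12)
// i.e. the returned size plus the return-address slot is always a multiple of
// the stack alignment.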
4727
4728/// Return true if the given stack call argument is already available in the
4729/// same position (relatively) of the caller's incoming argument stack.
4730static
4731bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4732 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4733 const X86InstrInfo *TII, const CCValAssign &VA) {
4734 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4735
4736 for (;;) {
4737 // Look through nodes that don't alter the bits of the incoming value.
4738 unsigned Op = Arg.getOpcode();
4739 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4740 Arg = Arg.getOperand(0);
4741 continue;
4742 }
4743 if (Op == ISD::TRUNCATE) {
4744 const SDValue &TruncInput = Arg.getOperand(0);
4745 if (TruncInput.getOpcode() == ISD::AssertZext &&
4746 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4747 Arg.getValueType()) {
4748 Arg = TruncInput.getOperand(0);
4749 continue;
4750 }
4751 }
4752 break;
4753 }
4754
4755  int FI = INT_MAX;
4756 if (Arg.getOpcode() == ISD::CopyFromReg) {
4757 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4758 if (!VR.isVirtual())
4759 return false;
4760 MachineInstr *Def = MRI->getVRegDef(VR);
4761 if (!Def)
4762 return false;
4763 if (!Flags.isByVal()) {
4764 if (!TII->isLoadFromStackSlot(*Def, FI))
4765 return false;
4766 } else {
4767 unsigned Opcode = Def->getOpcode();
4768 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4769 Opcode == X86::LEA64_32r) &&
4770 Def->getOperand(1).isFI()) {
4771 FI = Def->getOperand(1).getIndex();
4772 Bytes = Flags.getByValSize();
4773 } else
4774 return false;
4775 }
4776 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4777 if (Flags.isByVal())
4778 // ByVal argument is passed in as a pointer but it's now being
4779 // dereferenced. e.g.
4780 // define @foo(%struct.X* %A) {
4781 // tail call @bar(%struct.X* byval %A)
4782 // }
4783 return false;
4784 SDValue Ptr = Ld->getBasePtr();
4785 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4786 if (!FINode)
4787 return false;
4788 FI = FINode->getIndex();
4789 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4790 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4791 FI = FINode->getIndex();
4792 Bytes = Flags.getByValSize();
4793 } else
4794 return false;
4795
4796  assert(FI != INT_MAX);
4797 if (!MFI.isFixedObjectIndex(FI))
4798 return false;
4799
4800 if (Offset != MFI.getObjectOffset(FI))
4801 return false;
4802
4803 // If this is not byval, check that the argument stack object is immutable.
4804 // inalloca and argument copy elision can create mutable argument stack
4805 // objects. Byval objects can be mutated, but a byval call intends to pass the
4806 // mutated memory.
4807 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4808 return false;
4809
4810 if (VA.getLocVT().getFixedSizeInBits() >
4811 Arg.getValueSizeInBits().getFixedSize()) {
4812 // If the argument location is wider than the argument type, check that any
4813 // extension flags match.
4814 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4815 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4816 return false;
4817 }
4818 }
4819
4820 return Bytes == MFI.getObjectSize(FI);
4821}
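// Hedged example of a "matching" case (assumed IR, not taken from this file):
// a caller that receives an i32 at a fixed, immutable frame index with offset 4
// and forwards that same value as the callee's stack argument at offset 4
// passes every check above (same offset, same 4-byte size, immutable object),
// so the outgoing argument does not need to be copied for the tail call.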
4822
4823/// Check whether the call is eligible for tail call optimization. Targets
4824/// that want to do tail call optimization should implement this function.
4825bool X86TargetLowering::IsEligibleForTailCallOptimization(
4826 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleeStackStructRet,
4827 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
4828 const SmallVectorImpl<SDValue> &OutVals,
4829 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4830 if (!mayTailCallThisCC(CalleeCC))
4831 return false;
4832
4833 // If -tailcallopt is specified, make fastcc functions tail-callable.
4834 MachineFunction &MF = DAG.getMachineFunction();
4835 const Function &CallerF = MF.getFunction();
4836
4837 // If the function return type is x86_fp80 and the callee return type is not,
4838 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4839 // perform a tailcall optimization here.
4840 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4841 return false;
4842
4843 CallingConv::ID CallerCC = CallerF.getCallingConv();
4844 bool CCMatch = CallerCC == CalleeCC;
4845 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4846 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4847 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4848 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
4849
4850 // Win64 functions have extra shadow space for argument homing. Don't do the
4851 // sibcall if the caller and callee have mismatched expectations for this
4852 // space.
4853 if (IsCalleeWin64 != IsCallerWin64)
4854 return false;
4855
4856 if (IsGuaranteeTCO) {
4857 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4858 return true;
4859 return false;
4860 }
4861
4862 // Look for obvious safe cases to perform tail call optimization that do not
4863 // require ABI changes. This is what gcc calls sibcall.
4864
4865 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4866 // emit a special epilogue.
4867 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4868 if (RegInfo->hasStackRealignment(MF))
4869 return false;
4870
4871 // Also avoid sibcall optimization if we're an sret return fn and the callee
4872 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
4873 // insufficient.
4874 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
4875 // For a compatible tail call the callee must return our sret pointer. So it
4876 // needs to be (a) an sret function itself and (b) we pass our sret as its
4877 // sret. Condition #b is harder to determine.
4878 return false;
4879 } else if (Subtarget.is32Bit() && IsCalleeStackStructRet)
4880 // In the i686 ABI, the sret pointer is callee-pop, so we cannot tail-call,
4881 // as our caller doesn't expect that.
4882 return false;
4883
4884 // Do not sibcall optimize vararg calls unless all arguments are passed via
4885 // registers.
4886 LLVMContext &C = *DAG.getContext();
4887 if (isVarArg && !Outs.empty()) {
4888 // Optimizing for varargs on Win64 is unlikely to be safe without
4889 // additional testing.
4890 if (IsCalleeWin64 || IsCallerWin64)
4891 return false;
4892
4893 SmallVector<CCValAssign, 16> ArgLocs;
4894 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4895
4896 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4897 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4898 if (!ArgLocs[i].isRegLoc())
4899 return false;
4900 }
4901
4902 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4903 // stack. Therefore, if it's not used by the call it is not safe to optimize
4904 // this into a sibcall.
4905 bool Unused = false;
4906 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4907 if (!Ins[i].Used) {
4908 Unused = true;
4909 break;
4910 }
4911 }
4912 if (Unused) {
4913 SmallVector<CCValAssign, 16> RVLocs;
4914 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4915 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4916 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4917 CCValAssign &VA = RVLocs[i];
4918 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4919 return false;
4920 }
4921 }
4922
4923 // Check that the call results are passed in the same way.
4924 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4925 RetCC_X86, RetCC_X86))
4926 return false;
4927 // The callee has to preserve all registers the caller needs to preserve.
4928 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4929 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4930 if (!CCMatch) {
4931 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4932 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4933 return false;
4934 }
4935
4936 unsigned StackArgsSize = 0;
4937
4938 // If the callee takes no arguments then go on to check the results of the
4939 // call.
4940 if (!Outs.empty()) {
4941 // Check if stack adjustment is needed. For now, do not do this if any
4942 // argument is passed on the stack.
4943 SmallVector<CCValAssign, 16> ArgLocs;
4944 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4945
4946 // Allocate shadow area for Win64
4947 if (IsCalleeWin64)
4948 CCInfo.AllocateStack(32, Align(8));
4949
4950 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4951 StackArgsSize = CCInfo.getNextStackOffset();
4952
4953 if (CCInfo.getNextStackOffset()) {
4954 // Check if the arguments are already laid out in the right way as
4955 // the caller's fixed stack objects.
4956 MachineFrameInfo &MFI = MF.getFrameInfo();
4957 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4958 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4959 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4960 CCValAssign &VA = ArgLocs[i];
4961 SDValue Arg = OutVals[i];
4962 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4963 if (VA.getLocInfo() == CCValAssign::Indirect)
4964 return false;
4965 if (!VA.isRegLoc()) {
4966 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4967 MFI, MRI, TII, VA))
4968 return false;
4969 }
4970 }
4971 }
4972
4973 bool PositionIndependent = isPositionIndependent();
4974 // If the tailcall address may be in a register, then make sure it's
4975 // possible to register allocate for it. In 32-bit, the call address can
4976 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4977 // callee-saved registers are restored. These happen to be the same
4978 // registers used to pass 'inreg' arguments so watch out for those.
4979 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4980 !isa<ExternalSymbolSDNode>(Callee)) ||
4981 PositionIndependent)) {
4982 unsigned NumInRegs = 0;
4983 // In PIC we need an extra register to formulate the address computation
4984 // for the callee.
4985 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4986
4987 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4988 CCValAssign &VA = ArgLocs[i];
4989 if (!VA.isRegLoc())
4990 continue;
4991 Register Reg = VA.getLocReg();
4992 switch (Reg) {
4993 default: break;
4994 case X86::EAX: case X86::EDX: case X86::ECX:
4995 if (++NumInRegs == MaxInRegs)
4996 return false;
4997 break;
4998 }
4999 }
5000 }
5001
5002 const MachineRegisterInfo &MRI = MF.getRegInfo();
5003 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5004 return false;
5005 }
5006
5007 bool CalleeWillPop =
5008 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5009 MF.getTarget().Options.GuaranteedTailCallOpt);
5010
5011 if (unsigned BytesToPop =
5012 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5013 // If we have bytes to pop, the callee must pop them.
5014 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5015 if (!CalleePopMatches)
5016 return false;
5017 } else if (CalleeWillPop && StackArgsSize > 0) {
5018 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5019 return false;
5020 }
5021
5022 return true;
5023}
5024
5025FastISel *
5026X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5027 const TargetLibraryInfo *libInfo) const {
5028 return X86::createFastISel(funcInfo, libInfo);
5029}
5030
5031//===----------------------------------------------------------------------===//
5032// Other Lowering Hooks
5033//===----------------------------------------------------------------------===//
5034
5035static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
5036 return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
5037}
5038
5039static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5040 bool AssumeSingleUse = false) {
5041 if (!MayFoldLoad(Op, AssumeSingleUse))
5042 return false;
5043
5044  // We cannot replace a wide volatile load with a broadcast-from-memory,
5045 // because that would narrow the load, which isn't legal for volatiles.
5046 const LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op.getNode());
5047 return !Ld->isVolatile() ||
5048 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5049}
5050
5051static bool MayFoldIntoStore(SDValue Op) {
5052 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5053}
5054
5055static bool MayFoldIntoZeroExtend(SDValue Op) {
5056 if (Op.hasOneUse()) {
5057 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5058 return (ISD::ZERO_EXTEND == Opcode);
5059 }
5060 return false;
5061}
5062
5063static bool isTargetShuffle(unsigned Opcode) {
5064 switch(Opcode) {
5065 default: return false;
5066 case X86ISD::BLENDI:
5067 case X86ISD::PSHUFB:
5068 case X86ISD::PSHUFD:
5069 case X86ISD::PSHUFHW:
5070 case X86ISD::PSHUFLW:
5071 case X86ISD::SHUFP:
5072 case X86ISD::INSERTPS:
5073 case X86ISD::EXTRQI:
5074 case X86ISD::INSERTQI:
5075 case X86ISD::VALIGN:
5076 case X86ISD::PALIGNR:
5077 case X86ISD::VSHLDQ:
5078 case X86ISD::VSRLDQ:
5079 case X86ISD::MOVLHPS:
5080 case X86ISD::MOVHLPS:
5081 case X86ISD::MOVSHDUP:
5082 case X86ISD::MOVSLDUP:
5083 case X86ISD::MOVDDUP:
5084 case X86ISD::MOVSS:
5085 case X86ISD::MOVSD:
5086 case X86ISD::MOVSH:
5087 case X86ISD::UNPCKL:
5088 case X86ISD::UNPCKH:
5089 case X86ISD::VBROADCAST:
5090 case X86ISD::VPERMILPI:
5091 case X86ISD::VPERMILPV:
5092 case X86ISD::VPERM2X128:
5093 case X86ISD::SHUF128:
5094 case X86ISD::VPERMIL2:
5095 case X86ISD::VPERMI:
5096 case X86ISD::VPPERM:
5097 case X86ISD::VPERMV:
5098 case X86ISD::VPERMV3:
5099 case X86ISD::VZEXT_MOVL:
5100 return true;
5101 }
5102}
5103
5104static bool isTargetShuffleVariableMask(unsigned Opcode) {
5105 switch (Opcode) {
5106 default: return false;
5107 // Target Shuffles.
5108 case X86ISD::PSHUFB:
5109 case X86ISD::VPERMILPV:
5110 case X86ISD::VPERMIL2:
5111 case X86ISD::VPPERM:
5112 case X86ISD::VPERMV:
5113 case X86ISD::VPERMV3:
5114 return true;
5115 // 'Faux' Target Shuffles.
5116 case ISD::OR:
5117 case ISD::AND:
5118 case X86ISD::ANDNP:
5119 return true;
5120 }
5121}
5122
5123static bool isTargetShuffleSplat(SDValue Op) {
5124 unsigned Opcode = Op.getOpcode();
5125 if (Opcode == ISD::EXTRACT_SUBVECTOR)
5126 return isTargetShuffleSplat(Op.getOperand(0));
5127 return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
5128}
5129
5130SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5131 MachineFunction &MF = DAG.getMachineFunction();
5132 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5133 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5134 int ReturnAddrIndex = FuncInfo->getRAIndex();
5135
5136 if (ReturnAddrIndex == 0) {
5137 // Set up a frame object for the return address.
5138 unsigned SlotSize = RegInfo->getSlotSize();
5139 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5140 -(int64_t)SlotSize,
5141 false);
5142 FuncInfo->setRAIndex(ReturnAddrIndex);
5143 }
5144
5145 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5146}
5147
5148bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5149 bool hasSymbolicDisplacement) {
5150  // Offset should fit into a 32-bit immediate field.
5151 if (!isInt<32>(Offset))
5152 return false;
5153
5154 // If we don't have a symbolic displacement - we don't have any extra
5155 // restrictions.
5156 if (!hasSymbolicDisplacement)
5157 return true;
5158
5159 // FIXME: Some tweaks might be needed for medium code model.
5160 if (M != CodeModel::Small && M != CodeModel::Kernel)
5161 return false;
5162
5163  // For the small code model we assume that the last object ends at least 16MB
5164  // before the 31-bit boundary. We may also accept fairly large negative
5165  // constants, knowing that all objects are in the positive half of the address space.
5166 if (M == CodeModel::Small && Offset < 16*1024*1024)
5167 return true;
5168
5169  // For the kernel code model we know that all objects reside in the negative
5170  // half of the 32-bit address space. We must not accept negative offsets, since
5171  // they may fall just out of range, but we may accept fairly large positive ones.
5172 if (M == CodeModel::Kernel && Offset >= 0)
5173 return true;
5174
5175 return false;
5176}
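// A few concrete cases, assuming hasSymbolicDisplacement == true:
//   CodeModel::Small,  Offset = 15 * 1024 * 1024 -> true   (< 16MB)
//   CodeModel::Small,  Offset = 0x7F000000       -> false
//   CodeModel::Kernel, Offset = 0x1000           -> true
//   CodeModel::Kernel, Offset = -8               -> false
// Without a symbolic displacement, any offset that fits in 32 bits is accepted.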
5177
5178/// Determines whether the callee is required to pop its own arguments.
5179/// Callee pop is necessary to support tail calls.
5180bool X86::isCalleePop(CallingConv::ID CallingConv,
5181 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5182 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5183 // can guarantee TCO.
5184 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5185 return true;
5186
5187 switch (CallingConv) {
5188 default:
5189 return false;
5190 case CallingConv::X86_StdCall:
5191 case CallingConv::X86_FastCall:
5192 case CallingConv::X86_ThisCall:
5193 case CallingConv::X86_VectorCall:
5194 return !is64Bit;
5195 }
5196}
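// For example (ignoring the GuaranteeTCO special case above):
//   isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false, ...) -> true
//   isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/true,  ...) -> false
//   isCalleePop(CallingConv::C,           /*is64Bit=*/false, ...) -> false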
5197
5198/// Return true if the condition is a signed comparison operation.
5199static bool isX86CCSigned(unsigned X86CC) {
5200 switch (X86CC) {
5201 default:
5202    llvm_unreachable("Invalid integer condition!");
5203 case X86::COND_E:
5204 case X86::COND_NE:
5205 case X86::COND_B:
5206 case X86::COND_A:
5207 case X86::COND_BE:
5208 case X86::COND_AE:
5209 return false;
5210 case X86::COND_G:
5211 case X86::COND_GE:
5212 case X86::COND_L:
5213 case X86::COND_LE:
5214 return true;
5215 }
5216}
5217
5218static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5219 switch (SetCCOpcode) {
5220  default: llvm_unreachable("Invalid integer condition!");
5221 case ISD::SETEQ: return X86::COND_E;
5222 case ISD::SETGT: return X86::COND_G;
5223 case ISD::SETGE: return X86::COND_GE;
5224 case ISD::SETLT: return X86::COND_L;
5225 case ISD::SETLE: return X86::COND_LE;
5226 case ISD::SETNE: return X86::COND_NE;
5227 case ISD::SETULT: return X86::COND_B;
5228 case ISD::SETUGT: return X86::COND_A;
5229 case ISD::SETULE: return X86::COND_BE;
5230 case ISD::SETUGE: return X86::COND_AE;
5231 }
5232}
5233
5234/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
5235/// condition code, returning the condition code and the LHS/RHS of the
5236/// comparison to make.
5237static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5238 bool isFP, SDValue &LHS, SDValue &RHS,
5239 SelectionDAG &DAG) {
5240 if (!isFP) {
5241 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5242 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
5243 // X > -1 -> X == 0, jump !sign.
5244 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5245 return X86::COND_NS;
5246 }
5247 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
5248 // X < 0 -> X == 0, jump on sign.
5249 return X86::COND_S;
5250 }
5251 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
5252 // X >= 0 -> X == 0, jump on !sign.
5253 return X86::COND_NS;
5254 }
5255 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5256 // X < 1 -> X <= 0
5257 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5258 return X86::COND_LE;
5259 }
5260 }
5261
5262 return TranslateIntegerX86CC(SetCCOpcode);
5263 }
5264
5265 // First determine if it is required or is profitable to flip the operands.
5266
5267 // If LHS is a foldable load, but RHS is not, flip the condition.
5268 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5269 !ISD::isNON_EXTLoad(RHS.getNode())) {
5270 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5271 std::swap(LHS, RHS);
5272 }
5273
5274 switch (SetCCOpcode) {
5275 default: break;
5276 case ISD::SETOLT:
5277 case ISD::SETOLE:
5278 case ISD::SETUGT:
5279 case ISD::SETUGE:
5280 std::swap(LHS, RHS);
5281 break;
5282 }
5283
5284 // On a floating point condition, the flags are set as follows:
5285 // ZF PF CF op
5286 // 0 | 0 | 0 | X > Y
5287 // 0 | 0 | 1 | X < Y
5288 // 1 | 0 | 0 | X == Y
5289 // 1 | 1 | 1 | unordered
5290 switch (SetCCOpcode) {
5291  default: llvm_unreachable("Condcode should be pre-legalized away");
5292 case ISD::SETUEQ:
5293 case ISD::SETEQ: return X86::COND_E;
5294 case ISD::SETOLT: // flipped
5295 case ISD::SETOGT:
5296 case ISD::SETGT: return X86::COND_A;
5297 case ISD::SETOLE: // flipped
5298 case ISD::SETOGE:
5299 case ISD::SETGE: return X86::COND_AE;
5300 case ISD::SETUGT: // flipped
5301 case ISD::SETULT:
5302 case ISD::SETLT: return X86::COND_B;
5303 case ISD::SETUGE: // flipped
5304 case ISD::SETULE:
5305 case ISD::SETLE: return X86::COND_BE;
5306 case ISD::SETONE:
5307 case ISD::SETNE: return X86::COND_NE;
5308 case ISD::SETUO: return X86::COND_P;
5309 case ISD::SETO: return X86::COND_NP;
5310 case ISD::SETOEQ:
5311 case ISD::SETUNE: return X86::COND_INVALID;
5312 }
5313}
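// Worked example of one floating-point path: for an ordered SETOLT the
// operands are swapped above, and the second switch then returns X86::COND_A,
// so "x < y" is emitted as a compare of y against x followed by an
// above/unsigned condition - matching the flag table shown above.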
5314
5315/// Is there a floating point cmov for the specific X86 condition code?
5316/// Current x86 isa includes the following FP cmov instructions:
5317/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5318static bool hasFPCMov(unsigned X86CC) {
5319 switch (X86CC) {
5320 default:
5321 return false;
5322 case X86::COND_B:
5323 case X86::COND_BE:
5324 case X86::COND_E:
5325 case X86::COND_P:
5326 case X86::COND_A:
5327 case X86::COND_AE:
5328 case X86::COND_NE:
5329 case X86::COND_NP:
5330 return true;
5331 }
5332}
5333
5334
5335bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5336 const CallInst &I,
5337 MachineFunction &MF,
5338 unsigned Intrinsic) const {
5339 Info.flags = MachineMemOperand::MONone;
5340 Info.offset = 0;
5341
5342 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5343 if (!IntrData) {
5344 switch (Intrinsic) {
5345 case Intrinsic::x86_aesenc128kl:
5346 case Intrinsic::x86_aesdec128kl:
5347 Info.opc = ISD::INTRINSIC_W_CHAIN;
5348 Info.ptrVal = I.getArgOperand(1);
5349 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5350 Info.align = Align(1);
5351 Info.flags |= MachineMemOperand::MOLoad;
5352 return true;
5353 case Intrinsic::x86_aesenc256kl:
5354 case Intrinsic::x86_aesdec256kl:
5355 Info.opc = ISD::INTRINSIC_W_CHAIN;
5356 Info.ptrVal = I.getArgOperand(1);
5357 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5358 Info.align = Align(1);
5359 Info.flags |= MachineMemOperand::MOLoad;
5360 return true;
5361 case Intrinsic::x86_aesencwide128kl:
5362 case Intrinsic::x86_aesdecwide128kl:
5363 Info.opc = ISD::INTRINSIC_W_CHAIN;
5364 Info.ptrVal = I.getArgOperand(0);
5365 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5366 Info.align = Align(1);
5367 Info.flags |= MachineMemOperand::MOLoad;
5368 return true;
5369 case Intrinsic::x86_aesencwide256kl:
5370 case Intrinsic::x86_aesdecwide256kl:
5371 Info.opc = ISD::INTRINSIC_W_CHAIN;
5372 Info.ptrVal = I.getArgOperand(0);
5373 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5374 Info.align = Align(1);
5375 Info.flags |= MachineMemOperand::MOLoad;
5376 return true;
5377 }
5378 return false;
5379 }
5380
5381 switch (IntrData->Type) {
5382 case TRUNCATE_TO_MEM_VI8:
5383 case TRUNCATE_TO_MEM_VI16:
5384 case TRUNCATE_TO_MEM_VI32: {
5385 Info.opc = ISD::INTRINSIC_VOID;
5386 Info.ptrVal = I.getArgOperand(0);
5387 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5388 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5389 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5390 ScalarVT = MVT::i8;
5391 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5392 ScalarVT = MVT::i16;
5393 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5394 ScalarVT = MVT::i32;
5395
5396 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5397 Info.align = Align(1);
5398 Info.flags |= MachineMemOperand::MOStore;
5399 break;
5400 }
5401 case GATHER:
5402 case GATHER_AVX2: {
5403 Info.opc = ISD::INTRINSIC_W_CHAIN;
5404 Info.ptrVal = nullptr;
5405 MVT DataVT = MVT::getVT(I.getType());
5406 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5407 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5408 IndexVT.getVectorNumElements());
5409 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5410 Info.align = Align(1);
5411 Info.flags |= MachineMemOperand::MOLoad;
5412 break;
5413 }
5414 case SCATTER: {
5415 Info.opc = ISD::INTRINSIC_VOID;
5416 Info.ptrVal = nullptr;
5417 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5418 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5419 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5420 IndexVT.getVectorNumElements());
5421 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5422 Info.align = Align(1);
5423 Info.flags |= MachineMemOperand::MOStore;
5424 break;
5425 }
5426 default:
5427 return false;
5428 }
5429
5430 return true;
5431}
5432
5433/// Returns true if the target can instruction select the
5434/// specified FP immediate natively. If false, the legalizer will
5435/// materialize the FP immediate as a load from a constant pool.
5436bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5437 bool ForCodeSize) const {
5438 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5439 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5440 return true;
5441 }
5442 return false;
5443}
5444
5445bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5446 ISD::LoadExtType ExtTy,
5447 EVT NewVT) const {
5448  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5449
5450 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5451  // relocation must target a movq or addq instruction: don't let the load shrink.
5452 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5453 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5454 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5455 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5456
5457  // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
5458 // those uses are extracted directly into a store, then the extract + store
5459 // can be store-folded. Therefore, it's probably not worth splitting the load.
5460 EVT VT = Load->getValueType(0);
5461 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5462 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5463 // Skip uses of the chain value. Result 0 of the node is the load value.
5464 if (UI.getUse().getResNo() != 0)
5465 continue;
5466
5467 // If this use is not an extract + store, it's probably worth splitting.
5468 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5469 UI->use_begin()->getOpcode() != ISD::STORE)
5470 return true;
5471 }
5472 // All non-chain uses are extract + store.
5473 return false;
5474 }
5475
5476 return true;
5477}
5478
5479/// Returns true if it is beneficial to convert a load of a constant
5480/// to just the constant itself.
5481bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5482 Type *Ty) const {
5483  assert(Ty->isIntegerTy());
5484
5485 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5486 if (BitSize == 0 || BitSize > 64)
5487 return false;
5488 return true;
5489}
5490
5491bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5492 // If we are using XMM registers in the ABI and the condition of the select is
5493 // a floating-point compare and we have blendv or conditional move, then it is
5494 // cheaper to select instead of doing a cross-register move and creating a
5495 // load that depends on the compare result.
5496 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5497 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5498}
5499
5500bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5501 // TODO: It might be a win to ease or lift this restriction, but the generic
5502 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5503 if (VT.isVector() && Subtarget.hasAVX512())
5504 return false;
5505
5506 return true;
5507}
5508
5509bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5510 SDValue C) const {
5511 // TODO: We handle scalars using custom code, but generic combining could make
5512 // that unnecessary.
5513 APInt MulC;
5514 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5515 return false;
5516
5517  // Find the type this will be legalized to. Otherwise we might prematurely
5518 // convert this to shl+add/sub and then still have to type legalize those ops.
5519 // Another choice would be to defer the decision for illegal types until
5520 // after type legalization. But constant splat vectors of i64 can't make it
5521 // through type legalization on 32-bit targets so we would need to special
5522 // case vXi64.
5523 while (getTypeAction(Context, VT) != TypeLegal)
5524 VT = getTypeToTransformTo(Context, VT);
5525
5526 // If vector multiply is legal, assume that's faster than shl + add/sub.
5527 // TODO: Multiply is a complex op with higher latency and lower throughput in
5528 // most implementations, so this check could be loosened based on type
5529 // and/or a CPU attribute.
5530 if (isOperationLegal(ISD::MUL, VT))
5531 return false;
5532
5533 // shl+add, shl+sub, shl+add+neg
5534 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5535 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5536}
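// A few splat constants that satisfy the final check (illustrative sketch):
//   MulC =  5: (5 - 1)  = 4 is a power of 2 -> x * 5  == (x << 2) + x
//   MulC =  7: (7 + 1)  = 8 is a power of 2 -> x * 7  == (x << 3) - x
//   MulC = -3: (1 - -3) = 4 is a power of 2 -> x * -3 == x - (x << 2)
//   MulC = -9: -(-9 + 1) = 8 is a power of 2 -> x * -9 == -((x << 3) + x)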
5537
5538bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5539 unsigned Index) const {
5540 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5541 return false;
5542
5543 // Mask vectors support all subregister combinations and operations that
5544 // extract half of vector.
5545 if (ResVT.getVectorElementType() == MVT::i1)
5546 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5547 (Index == ResVT.getVectorNumElements()));
5548
5549 return (Index % ResVT.getVectorNumElements()) == 0;
5550}
5551
5552bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5553 unsigned Opc = VecOp.getOpcode();
5554
5555 // Assume target opcodes can't be scalarized.
5556 // TODO - do we have any exceptions?
5557 if (Opc >= ISD::BUILTIN_OP_END)
5558 return false;
5559
5560 // If the vector op is not supported, try to convert to scalar.
5561 EVT VecVT = VecOp.getValueType();
5562 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5563 return true;
5564
5565 // If the vector op is supported, but the scalar op is not, the transform may
5566 // not be worthwhile.
5567 EVT ScalarVT = VecVT.getScalarType();
5568 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5569}
5570
5571bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5572 bool) const {
5573 // TODO: Allow vectors?
5574 if (VT.isVector())
5575 return false;
5576 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5577}
5578
5579bool X86TargetLowering::isCheapToSpeculateCttz() const {
5580 // Speculate cttz only if we can directly use TZCNT.
5581 return Subtarget.hasBMI();
5582}
5583
5584bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5585 // Speculate ctlz only if we can directly use LZCNT.
5586 return Subtarget.hasLZCNT();
5587}
5588
5589bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5590 const SelectionDAG &DAG,
5591 const MachineMemOperand &MMO) const {
5592 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5593 BitcastVT.getVectorElementType() == MVT::i1)
5594 return false;
5595
5596 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5597 return false;
5598
5599 // If both types are legal vectors, it's always ok to convert them.
5600 if (LoadVT.isVector() && BitcastVT.isVector() &&
5601 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5602 return true;
5603
5604 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5605}
5606
5607bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5608 const MachineFunction &MF) const {
5609  // Do not merge to float value size (128 bits) if no implicit
5610  // float attribute is set.
5611 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
5612
5613 if (NoFloat) {
5614 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5615 return (MemVT.getSizeInBits() <= MaxIntSize);
5616 }
5617 // Make sure we don't merge greater than our preferred vector
5618 // width.
5619 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5620 return false;
5621
5622 return true;
5623}
5624
5625bool X86TargetLowering::isCtlzFast() const {
5626 return Subtarget.hasFastLZCNT();
5627}
5628
5629bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5630 const Instruction &AndI) const {
5631 return true;
5632}
5633
5634bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5635 EVT VT = Y.getValueType();
5636
5637 if (VT.isVector())
5638 return false;
5639
5640 if (!Subtarget.hasBMI())
5641 return false;
5642
5643 // There are only 32-bit and 64-bit forms for 'andn'.
5644 if (VT != MVT::i32 && VT != MVT::i64)
5645 return false;
5646
5647 return !isa<ConstantSDNode>(Y);
5648}
5649
5650bool X86TargetLowering::hasAndNot(SDValue Y) const {
5651 EVT VT = Y.getValueType();
5652
5653 if (!VT.isVector())
5654 return hasAndNotCompare(Y);
5655
5656 // Vector.
5657
5658 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5659 return false;
5660
5661 if (VT == MVT::v4i32)
5662 return true;
5663
5664 return Subtarget.hasSSE2();
5665}
5666
5667bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5668 return X.getValueType().isScalarInteger(); // 'bt'
5669}
5670
5671bool X86TargetLowering::
5672 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5673 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5674 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5675 SelectionDAG &DAG) const {
5676 // Does baseline recommend not to perform the fold by default?
5677 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5678 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5679 return false;
5680 // For scalars this transform is always beneficial.
5681 if (X.getValueType().isScalarInteger())
5682 return true;
5683 // If all the shift amounts are identical, then transform is beneficial even
5684 // with rudimentary SSE2 shifts.
5685 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5686 return true;
5687  // If we have AVX2 with its powerful shift operations, then it's also good.
5688 if (Subtarget.hasAVX2())
5689 return true;
5690  // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
5691 return NewShiftOpcode == ISD::SHL;
5692}
5693
5694bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5695 const SDNode *N, CombineLevel Level) const {
5696  assert(((N->getOpcode() == ISD::SHL &&
5697           N->getOperand(0).getOpcode() == ISD::SRL) ||
5698          (N->getOpcode() == ISD::SRL &&
5699           N->getOperand(0).getOpcode() == ISD::SHL)) &&
5700         "Expected shift-shift mask");
5701 EVT VT = N->getValueType(0);
5702 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5703 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5704 // Only fold if the shift values are equal - so it folds to AND.
5705 // TODO - we should fold if either is a non-uniform vector but we don't do
5706 // the fold for non-splats yet.
5707 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5708 }
5709 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5710}
5711
5712bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5713 EVT VT = Y.getValueType();
5714
5715 // For vectors, we don't have a preference, but we probably want a mask.
5716 if (VT.isVector())
5717 return false;
5718
5719 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5720 if (VT == MVT::i64 && !Subtarget.is64Bit())
5721 return false;
5722
5723 return true;
5724}
5725
5726bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5727 SDNode *N) const {
5728 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5729 !Subtarget.isOSWindows())
5730 return false;
5731 return true;
5732}
5733
5734bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5735 // Any legal vector type can be splatted more efficiently than
5736 // loading/spilling from memory.
5737 return isTypeLegal(VT);
5738}
5739
5740MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5741 MVT VT = MVT::getIntegerVT(NumBits);
5742 if (isTypeLegal(VT))
5743 return VT;
5744
5745 // PMOVMSKB can handle this.
5746 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5747 return MVT::v16i8;
5748
5749 // VPMOVMSKB can handle this.
5750 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5751 return MVT::v32i8;
5752
5753 // TODO: Allow 64-bit type for 32-bit target.
5754 // TODO: 512-bit types should be allowed, but make sure that those
5755 // cases are handled in combineVectorSizedSetCCEquality().
5756
5757 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5758}
5759
5760/// Val is the undef sentinel value or equal to the specified value.
5761static bool isUndefOrEqual(int Val, int CmpVal) {
5762 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5763}
5764
5765/// Return true if every element in Mask is the undef sentinel value or equal to
5766/// the specified value.
5767static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
5768 return llvm::all_of(Mask, [CmpVal](int M) {
5769 return (M == SM_SentinelUndef) || (M == CmpVal);
5770 });
5771}
5772
5773/// Val is either the undef or zero sentinel value.
5774static bool isUndefOrZero(int Val) {
5775 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5776}
5777
5778/// Return true if every element in Mask, beginning from position Pos and ending
5779/// in Pos+Size is the undef sentinel value.
5780static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5781 return llvm::all_of(Mask.slice(Pos, Size),
5782 [](int M) { return M == SM_SentinelUndef; });
5783}
5784
5785/// Return true if the mask creates a vector whose lower half is undefined.
5786static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5787 unsigned NumElts = Mask.size();
5788 return isUndefInRange(Mask, 0, NumElts / 2);
5789}
5790
5791/// Return true if the mask creates a vector whose upper half is undefined.
5792static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5793 unsigned NumElts = Mask.size();
5794 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5795}
5796
5797/// Return true if Val falls within the specified range [Low, Hi).
5798static bool isInRange(int Val, int Low, int Hi) {
5799 return (Val >= Low && Val < Hi);
5800}
5801
5802/// Return true if the value of any element in Mask falls within the specified
5803/// range [Low, Hi).
5804static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5805 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5806}
5807
5808/// Return true if the value of any element in Mask is the zero sentinel value.
5809static bool isAnyZero(ArrayRef<int> Mask) {
5810 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
5811}
5812
5813/// Return true if the value of any element in Mask is the zero or undef
5814/// sentinel value.
5815static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
5816 return llvm::any_of(Mask, [](int M) {
5817 return M == SM_SentinelZero || M == SM_SentinelUndef;
5818 });
5819}
5820
5821/// Return true if Val is undef or if its value falls within the
5822/// specified range [Low, Hi).
5823static bool isUndefOrInRange(int Val, int Low, int Hi) {
5824 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5825}
5826
5827/// Return true if every element in Mask is undef or if its value
5828/// falls within the specified range [Low, Hi).
5829static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5830 return llvm::all_of(
5831 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5832}
5833
5834/// Return true if Val is undef, zero or if its value falls within the
5835/// specified range [Low, Hi).
5836static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5837 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5838}
5839
5840/// Return true if every element in Mask is undef, zero or if its value
5841/// falls within the specified range [Low, Hi).
5842static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5843 return llvm::all_of(
5844 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5845}
5846
5847/// Return true if every element in Mask, beginning
5848/// from position Pos and ending in Pos + Size, falls within the specified
5849/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5850static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5851 unsigned Size, int Low, int Step = 1) {
5852 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5853 if (!isUndefOrEqual(Mask[i], Low))
5854 return false;
5855 return true;
5856}
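// For instance, with Pos = 0, Size = 4, Low = 4, Step = 1:
//   Mask = {4, -1, 6, 7} -> true   (the undef element is accepted)
//   Mask = {4, 5, 5, 7}  -> false  (element 2 should be 6)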
5857
5858/// Return true if every element in Mask, beginning
5859/// from position Pos and ending in Pos+Size, falls within the specified
5860/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is zero.
5861static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5862 unsigned Size, int Low,
5863 int Step = 1) {
5864 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5865 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5866 return false;
5867 return true;
5868}
5869
5870/// Return true if every element in Mask, beginning
5871/// from position Pos and ending in Pos+Size is undef or is zero.
5872static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5873 unsigned Size) {
5874 return llvm::all_of(Mask.slice(Pos, Size),
5875 [](int M) { return isUndefOrZero(M); });
5876}
5877
5878/// Helper function to test whether a shuffle mask could be
5879/// simplified by widening the elements being shuffled.
5880///
5881/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5882/// leaves it in an unspecified state.
5883///
5884/// NOTE: This must handle normal vector shuffle masks and *target* vector
5885/// shuffle masks. The latter have the special property of a '-2' representing
5886/// a zero-ed lane of a vector.
5887static bool canWidenShuffleElements(ArrayRef<int> Mask,
5888 SmallVectorImpl<int> &WidenedMask) {
5889 WidenedMask.assign(Mask.size() / 2, 0);
5890 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5891 int M0 = Mask[i];
5892 int M1 = Mask[i + 1];
5893
5894    // If both elements are undef, it's trivial.
5895 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5896 WidenedMask[i / 2] = SM_SentinelUndef;
5897 continue;
5898 }
5899
5900 // Check for an undef mask and a mask value properly aligned to fit with
5901 // a pair of values. If we find such a case, use the non-undef mask's value.
5902 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5903 WidenedMask[i / 2] = M1 / 2;
5904 continue;
5905 }
5906 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5907 WidenedMask[i / 2] = M0 / 2;
5908 continue;
5909 }
5910
5911 // When zeroing, we need to spread the zeroing across both lanes to widen.
5912 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5913 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5914 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5915 WidenedMask[i / 2] = SM_SentinelZero;
5916 continue;
5917 }
5918 return false;
5919 }
5920
5921 // Finally check if the two mask values are adjacent and aligned with
5922 // a pair.
5923 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5924 WidenedMask[i / 2] = M0 / 2;
5925 continue;
5926 }
5927
5928 // Otherwise we can't safely widen the elements used in this shuffle.
5929 return false;
5930 }
5931  assert(WidenedMask.size() == Mask.size() / 2 &&
5932         "Incorrect size of mask after widening the elements!");
5933
5934 return true;
5935}
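// Worked example: Mask = {0, 1, -1, -1, 6, 7, -2, -2} (with -1 undef and -2
// zero, as described in the NOTE above) widens to WidenedMask = {0, -1, 3, -2}.
// A mask such as {0, 2, ...} fails, because the pair (0, 2) is neither
// adjacent nor padded with undef/zero, so the function returns false.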
5936
5937static bool canWidenShuffleElements(ArrayRef<int> Mask,
5938 const APInt &Zeroable,
5939 bool V2IsZero,
5940 SmallVectorImpl<int> &WidenedMask) {
5941 // Create an alternative mask with info about zeroable elements.
5942 // Here we do not set undef elements as zeroable.
5943 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5944 if (V2IsZero) {
5945    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5946 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5947 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5948 ZeroableMask[i] = SM_SentinelZero;
5949 }
5950 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5951}
5952
5953static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5954 SmallVector<int, 32> WidenedMask;
5955 return canWidenShuffleElements(Mask, WidenedMask);
5956}
5957
5958// Attempt to narrow/widen shuffle mask until it matches the target number of
5959// elements.
5960static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
5961 SmallVectorImpl<int> &ScaledMask) {
5962 unsigned NumSrcElts = Mask.size();
5963  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
5964         "Illegal shuffle scale factor");
5965
5966 // Narrowing is guaranteed to work.
5967 if (NumDstElts >= NumSrcElts) {
5968 int Scale = NumDstElts / NumSrcElts;
5969 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
5970 return true;
5971 }
5972
5973 // We have to repeat the widening until we reach the target size, but we can
5974 // split out the first widening as it sets up ScaledMask for us.
5975 if (canWidenShuffleElements(Mask, ScaledMask)) {
5976 while (ScaledMask.size() > NumDstElts) {
5977 SmallVector<int, 16> WidenedMask;
5978 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
5979 return false;
5980 ScaledMask = std::move(WidenedMask);
5981 }
5982 return true;
5983 }
5984
5985 return false;
5986}
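// For example, with Mask = {0, 1, 2, 3}:
//   NumDstElts = 8 -> narrowed to {0, 1, 2, 3, 4, 5, 6, 7}
//   NumDstElts = 2 -> widened to  {0, 1}
// whereas Mask = {1, 0, 2, 3} cannot be widened to 2 elements (the pair (1, 0)
// is not adjacent), so the function returns false in that case.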
5987
5988/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5989bool X86::isZeroNode(SDValue Elt) {
5990 return isNullConstant(Elt) || isNullFPConstant(Elt);
5991}
5992
5993// Build a vector of constants.
5994// Use an UNDEF node if MaskElt == -1.
5995// Split 64-bit constants in 32-bit mode.
5996static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5997 const SDLoc &dl, bool IsMask = false) {
5998
5999 SmallVector<SDValue, 32> Ops;
6000 bool Split = false;
6001
6002 MVT ConstVecVT = VT;
6003 unsigned NumElts = VT.getVectorNumElements();
6004 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6005 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6006 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6007 Split = true;
6008 }
6009
6010 MVT EltVT = ConstVecVT.getVectorElementType();
6011 for (unsigned i = 0; i < NumElts; ++i) {
6012 bool IsUndef = Values[i] < 0 && IsMask;
6013 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6014 DAG.getConstant(Values[i], dl, EltVT);
6015 Ops.push_back(OpNode);
6016 if (Split)
6017 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6018 DAG.getConstant(0, dl, EltVT));
6019 }
6020 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6021 if (Split)
6022 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6023 return ConstsNode;
6024}
6025
6026static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6027 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6028 assert(Bits.size() == Undefs.getBitWidth() &&
6029 "Unequal constant and undef arrays");
6030 SmallVector<SDValue, 32> Ops;
6031 bool Split = false;
6032
6033 MVT ConstVecVT = VT;
6034 unsigned NumElts = VT.getVectorNumElements();
6035 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6036 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6037 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6038 Split = true;
6039 }
6040
6041 MVT EltVT = ConstVecVT.getVectorElementType();
6042 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6043 if (Undefs[i]) {
6044 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6045 continue;
6046 }
6047 const APInt &V = Bits[i];
6048 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6049 if (Split) {
6050 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6051 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6052 } else if (EltVT == MVT::f32) {
6053 APFloat FV(APFloat::IEEEsingle(), V);
6054 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6055 } else if (EltVT == MVT::f64) {
6056 APFloat FV(APFloat::IEEEdouble(), V);
6057 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6058 } else {
6059 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6060 }
6061 }
6062
6063 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6064 return DAG.getBitcast(VT, ConstsNode);
6065}
6066
6067/// Returns a vector of specified type with all zero elements.
6068static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6069 SelectionDAG &DAG, const SDLoc &dl) {
6070 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6071 VT.getVectorElementType() == MVT::i1) &&
6072 "Unexpected vector type");
6073
6074 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6075 // type. This ensures they get CSE'd. But if the integer type is not
6076 // available, use a floating-point +0.0 instead.
6077 SDValue Vec;
6078 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6079 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6080 } else if (VT.isFloatingPoint()) {
6081 Vec = DAG.getConstantFP(+0.0, dl, VT);
6082 } else if (VT.getVectorElementType() == MVT::i1) {
6083 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6084 "Unexpected vector type");
6085 Vec = DAG.getConstant(0, dl, VT);
6086 } else {
6087 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6088 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6089 }
6090 return DAG.getBitcast(VT, Vec);
6091}
6092
6093static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6094 const SDLoc &dl, unsigned vectorWidth) {
6095 EVT VT = Vec.getValueType();
6096 EVT ElVT = VT.getVectorElementType();
6097 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6098 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6099 VT.getVectorNumElements() / Factor);
6100
6101 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6102 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6103 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6104
6105 // This is the index of the first element of the vectorWidth-bit chunk
6106 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
6107 IdxVal &= ~(ElemsPerChunk - 1);
6108
6109 // If the input is a buildvector just emit a smaller one.
6110 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6111 return DAG.getBuildVector(ResultVT, dl,
6112 Vec->ops().slice(IdxVal, ElemsPerChunk));
6113
6114 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6115 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6116}
6117
6118/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6119/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6120/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6121/// instructions or a simple subregister reference. Idx is an index in the
6122/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6123/// lowering EXTRACT_VECTOR_ELT operations easier.
6124static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6125 SelectionDAG &DAG, const SDLoc &dl) {
6126 assert((Vec.getValueType().is256BitVector() ||
6127 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6128 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6129}
6130
6131/// Generate a DAG to grab 256-bits from a 512-bit vector.
6132static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6133 SelectionDAG &DAG, const SDLoc &dl) {
6134 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6135 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6136}
6137
6138static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6139 SelectionDAG &DAG, const SDLoc &dl,
6140 unsigned vectorWidth) {
6141 assert((vectorWidth == 128 || vectorWidth == 256) &&
6142 "Unsupported vector width");
6143 // Inserting UNDEF is a no-op, so just return Result.
6144 if (Vec.isUndef())
6145 return Result;
6146 EVT VT = Vec.getValueType();
6147 EVT ElVT = VT.getVectorElementType();
6148 EVT ResultVT = Result.getValueType();
6149
6150 // Insert the relevant vectorWidth bits.
6151 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6152 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6153
6154 // This is the index of the first element of the vectorWidth-bit chunk
6155 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
6156 IdxVal &= ~(ElemsPerChunk - 1);
6157
6158 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6159 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6160}
6161
6162/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6163/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6164/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6165/// simple superregister reference. Idx is an index in the 128 bits
6166/// we want. It need not be aligned to a 128-bit boundary. That makes
6167/// lowering INSERT_VECTOR_ELT operations easier.
6168static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6169 SelectionDAG &DAG, const SDLoc &dl) {
6170 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6171 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6172}
6173
6174/// Widen a vector to a larger size with the same scalar type, with the new
6175/// elements either zero or undef.
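/// e.g. widening a v4i32 value to v8i32 inserts it at element 0 of a zero (or
/// undef) v8i32 via INSERT_SUBVECTOR.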
6176static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6177 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6178 const SDLoc &dl) {
6179 assert(Vec.getValueSizeInBits().getFixedSize() < VT.getFixedSizeInBits() &&
6180 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6181 "Unsupported vector widening type");
6182 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6183 : DAG.getUNDEF(VT);
6184 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6185 DAG.getIntPtrConstant(0, dl));
6186}
6187
6188/// Widen a vector to a larger size with the same scalar type, with the new
6189/// elements either zero or undef.
6190static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6191 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6192 const SDLoc &dl, unsigned WideSizeInBits) {
6193 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6194 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6195 "Unsupported vector widening type");
6196 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6197 MVT SVT = Vec.getSimpleValueType().getScalarType();
6198 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6199 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6200}
6201
6202// Helper function to collect subvector ops that are concatenated together,
6203// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
6204// The subvectors in Ops are guaranteed to be the same type.
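// e.g. insert_subvector(insert_subvector(undef, X, 0), Y, NumElts/2) is
// collected as { X, Y }.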
6205static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
6206 assert(Ops.empty() && "Expected an empty ops vector");
6207
6208 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6209 Ops.append(N->op_begin(), N->op_end());
6210 return true;
6211 }
6212
6213 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6214 SDValue Src = N->getOperand(0);
6215 SDValue Sub = N->getOperand(1);
6216 const APInt &Idx = N->getConstantOperandAPInt(2);
6217 EVT VT = Src.getValueType();
6218 EVT SubVT = Sub.getValueType();
6219
6220 // TODO - Handle more general insert_subvector chains.
6221 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
6222 Idx == (VT.getVectorNumElements() / 2)) {
6223 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6224 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6225 Src.getOperand(1).getValueType() == SubVT &&
6226 isNullConstant(Src.getOperand(2))) {
6227 Ops.push_back(Src.getOperand(1));
6228 Ops.push_back(Sub);
6229 return true;
6230 }
6231 // insert_subvector(x, extract_subvector(x, lo), hi)
6232 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6233 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6234 Ops.append(2, Sub);
6235 return true;
6236 }
6237 }
6238 }
6239
6240 return false;
6241}
6242
6243static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6244 const SDLoc &dl) {
6245 EVT VT = Op.getValueType();
6246 unsigned NumElems = VT.getVectorNumElements();
6247 unsigned SizeInBits = VT.getSizeInBits();
6248 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6249 "Can't split odd sized vector");
6250
6251 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6252 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6253 return std::make_pair(Lo, Hi);
6254}
6255
6256// Split a unary integer op into 2 half-sized ops.
6257static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6258 EVT VT = Op.getValueType();
6259
6260 // Make sure we only try to split 256/512-bit types to avoid creating
6261 // narrow vectors.
6262 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6263 Op.getOperand(0).getValueType().is512BitVector()) &&
6264 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6265 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6266 VT.getVectorNumElements() &&
6267 "Unexpected VTs!");
6268
6269 SDLoc dl(Op);
6270
6271 // Extract the Lo/Hi vectors
6272 SDValue Lo, Hi;
6273 std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
6274
6275 EVT LoVT, HiVT;
6276 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6277 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6278 DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
6279 DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
6280}
6281
6282/// Break a binary integer operation into 2 half-sized ops and then
6283/// concatenate the result back.
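/// e.g. a 512-bit binary op can be lowered as two 256-bit ops whose results
/// are rejoined with CONCAT_VECTORS.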
6284static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6285 EVT VT = Op.getValueType();
6286
6287 // Sanity check that all the types match.
6288 assert(Op.getOperand(0).getValueType() == VT &&
6289 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6290 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6291
6292 SDLoc dl(Op);
6293
6294 // Extract the LHS Lo/Hi vectors
6295 SDValue LHS1, LHS2;
6296 std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
6297
6298 // Extract the RHS Lo/Hi vectors
6299 SDValue RHS1, RHS2;
6300 std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
6301
6302 EVT LoVT, HiVT;
6303 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6304 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6305 DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
6306 DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
6307}
6308
6309// Helper for splitting operands of an operation to legal target size and
6310// apply a function on each part.
6311// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6312// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6313// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6314// The argument Builder is a function that will be applied on each split part:
6315// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
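// e.g. with CheckBWI=true on an AVX2 target without BWI, a 512-bit VT is split
// into two 256-bit parts, Builder is applied to each part, and the results are
// concatenated back together.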
6316template <typename F>
6317SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6318 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6319 F Builder, bool CheckBWI = true) {
6320 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6321 unsigned NumSubs = 1;
6322 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6323 (!CheckBWI && Subtarget.useAVX512Regs())) {
6324 if (VT.getSizeInBits() > 512) {
6325 NumSubs = VT.getSizeInBits() / 512;
6326 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6327 }
6328 } else if (Subtarget.hasAVX2()) {
6329 if (VT.getSizeInBits() > 256) {
6330 NumSubs = VT.getSizeInBits() / 256;
6331 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6332 }
6333 } else {
6334 if (VT.getSizeInBits() > 128) {
6335 NumSubs = VT.getSizeInBits() / 128;
6336 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6337 }
6338 }
6339
6340 if (NumSubs == 1)
6341 return Builder(DAG, DL, Ops);
6342
6343 SmallVector<SDValue, 4> Subs;
6344 for (unsigned i = 0; i != NumSubs; ++i) {
6345 SmallVector<SDValue, 2> SubOps;
6346 for (SDValue Op : Ops) {
6347 EVT OpVT = Op.getValueType();
6348 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6349 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6350 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6351 }
6352 Subs.push_back(Builder(DAG, DL, SubOps));
6353 }
6354 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6355}
6356
6357/// Insert i1-subvector to i1-vector.
6358static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6359 const X86Subtarget &Subtarget) {
6360
6361 SDLoc dl(Op);
6362 SDValue Vec = Op.getOperand(0);
6363 SDValue SubVec = Op.getOperand(1);
6364 SDValue Idx = Op.getOperand(2);
6365 unsigned IdxVal = Op.getConstantOperandVal(2);
6366
6367 // Inserting undef is a nop. We can just return the original vector.
6368 if (SubVec.isUndef())
6369 return Vec;
6370
6371 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6372 return Op;
6373
6374 MVT OpVT = Op.getSimpleValueType();
6375 unsigned NumElems = OpVT.getVectorNumElements();
6376 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6377
6378 // Extend to natively supported kshift.
6379 MVT WideOpVT = OpVT;
6380 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6381 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6382
6383 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6384 // if necessary.
6385 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6386 // May need to promote to a legal type.
6387 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6388 DAG.getConstant(0, dl, WideOpVT),
6389 SubVec, Idx);
6390 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6391 }
6392
6393 MVT SubVecVT = SubVec.getSimpleValueType();
6394 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6395 assert(IdxVal + SubVecNumElems <= NumElems &&
6396 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6397 "Unexpected index value in INSERT_SUBVECTOR");
6398
6399 SDValue Undef = DAG.getUNDEF(WideOpVT);
6400
6401 if (IdxVal == 0) {
6402 // Zero lower bits of the Vec
6403 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6404 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6405 ZeroIdx);
6406 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6407 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6408 // Merge them together, SubVec should be zero extended.
6409 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6410 DAG.getConstant(0, dl, WideOpVT),
6411 SubVec, ZeroIdx);
6412 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6413 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6414 }
6415
6416 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6417 Undef, SubVec, ZeroIdx);
6418
6419 if (Vec.isUndef()) {
6420 assert(IdxVal != 0 && "Unexpected index");
6421 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6422 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6423 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6424 }
6425
6426 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6427 assert(IdxVal != 0 && "Unexpected index");
6428 // If upper elements of Vec are known undef, then just shift into place.
6429 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6430 [](SDValue V) { return V.isUndef(); })) {
6431 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6432 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6433 } else {
6434 NumElems = WideOpVT.getVectorNumElements();
6435 unsigned ShiftLeft = NumElems - SubVecNumElems;
6436 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6437 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6438 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6439 if (ShiftRight != 0)
6440 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6441 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6442 }
6443 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6444 }
6445
6446 // Simple case when we put subvector in the upper part
6447 if (IdxVal + SubVecNumElems == NumElems) {
6448 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6449 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6450 if (SubVecNumElems * 2 == NumElems) {
6451 // Special case, use legal zero extending insert_subvector. This allows
6452 // isel to optimize when bits are known zero.
6453 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6454 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6455 DAG.getConstant(0, dl, WideOpVT),
6456 Vec, ZeroIdx);
6457 } else {
6458 // Otherwise use explicit shifts to zero the bits.
6459 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6460 Undef, Vec, ZeroIdx);
6461 NumElems = WideOpVT.getVectorNumElements();
6462 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6463 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6464 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6465 }
6466 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6467 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6468 }
6469
6470 // Inserting into the middle is more complicated.
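// Either mask away the affected bits of Vec and OR in the shifted subvector,
// or (for v64i1 on 32-bit targets) isolate the low and high pieces of Vec with
// shift pairs and OR all three pieces together.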
6471
6472 NumElems = WideOpVT.getVectorNumElements();
6473
6474 // Widen the vector if needed.
6475 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6476
6477 unsigned ShiftLeft = NumElems - SubVecNumElems;
6478 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6479
6480 // Do an optimization for the most frequently used types.
6481 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6482 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6483 Mask0.flipAllBits();
6484 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6485 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6486 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6487 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6488 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6489 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6490 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6491 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6492
6493 // Reduce to original width if needed.
6494 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6495 }
6496
6497 // Clear the upper bits of the subvector and move it to its insert position.
6498 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6499 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6500 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6501 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6502
6503 // Isolate the bits below the insertion point.
6504 unsigned LowShift = NumElems - IdxVal;
6505 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6506 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6507 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6508 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6509
6510 // Isolate the bits after the last inserted bit.
6511 unsigned HighShift = IdxVal + SubVecNumElems;
6512 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6513 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6514 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6515 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6516
6517 // Now OR all 3 pieces together.
6518 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6519 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6520
6521 // Reduce to original width if needed.
6522 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6523}
6524
6525static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6526 const SDLoc &dl) {
6527 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6528 EVT SubVT = V1.getValueType();
6529 EVT SubSVT = SubVT.getScalarType();
6530 unsigned SubNumElts = SubVT.getVectorNumElements();
6531 unsigned SubVectorWidth = SubVT.getSizeInBits();
6532 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6533 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6534 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6535}
6536
6537/// Returns a vector of specified type with all bits set.
6538/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6539/// Then bitcast to their original type, ensuring they get CSE'd.
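/// e.g. a v4i64 all-ones vector is built as the v8i32 splat of -1 and then
/// bitcast to v4i64.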
6540static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6541 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6542 "Expected a 128/256/512-bit vector type");
6543
6544 APInt Ones = APInt::getAllOnesValue(32);
6545 unsigned NumElts = VT.getSizeInBits() / 32;
6546 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6547 return DAG.getBitcast(VT, Vec);
6548}
6549
6550// Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
6551static unsigned getOpcode_EXTEND(unsigned Opcode) {
6552 switch (Opcode) {
6553 case ISD::ANY_EXTEND:
6554 case ISD::ANY_EXTEND_VECTOR_INREG:
6555 return ISD::ANY_EXTEND;
6556 case ISD::ZERO_EXTEND:
6557 case ISD::ZERO_EXTEND_VECTOR_INREG:
6558 return ISD::ZERO_EXTEND;
6559 case ISD::SIGN_EXTEND:
6560 case ISD::SIGN_EXTEND_VECTOR_INREG:
6561 return ISD::SIGN_EXTEND;
6562 }
6563 llvm_unreachable("Unknown opcode");
6564}
6565
6566// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6567static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6568 switch (Opcode) {
6569 case ISD::ANY_EXTEND:
6570 case ISD::ANY_EXTEND_VECTOR_INREG:
6571 return ISD::ANY_EXTEND_VECTOR_INREG;
6572 case ISD::ZERO_EXTEND:
6573 case ISD::ZERO_EXTEND_VECTOR_INREG:
6574 return ISD::ZERO_EXTEND_VECTOR_INREG;
6575 case ISD::SIGN_EXTEND:
6576 case ISD::SIGN_EXTEND_VECTOR_INREG:
6577 return ISD::SIGN_EXTEND_VECTOR_INREG;
6578 }
6579 llvm_unreachable("Unknown opcode");
6580}
6581
6582static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
6583 SDValue In, SelectionDAG &DAG) {
6584 EVT InVT = In.getValueType();
6585 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6586 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6587 ISD::ZERO_EXTEND == Opcode) &&
6588 "Unknown extension opcode");
6589
6590 // For 256-bit vectors, we only need the lower (128-bit) input half.
6591 // For 512-bit vectors, we only need the lower input half or quarter.
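// e.g. extending a 256-bit v16i16 source to v8i32 only needs its low v8i16
// half; the *_VECTOR_INREG form is selected below when element counts differ.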
6592 if (InVT.getSizeInBits() > 128) {
6593 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6594 "Expected VTs to be the same size!");
6595 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6596 In = extractSubVector(In, 0, DAG, DL,
6597 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6598 InVT = In.getValueType();
6599 }
6600
6601 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6602 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6603
6604 return DAG.getNode(Opcode, DL, VT, In);
6605}
6606
6607// Match (xor X, -1) -> X.
6608// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6609// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6610static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6611 V = peekThroughBitcasts(V);
6612 if (V.getOpcode() == ISD::XOR &&
6613 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6614 return V.getOperand(0);
6615 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6616 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6617 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6618 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6619 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6620 Not, V.getOperand(1));
6621 }
6622 }
6623 SmallVector<SDValue, 2> CatOps;
6624 if (collectConcatOps(V.getNode(), CatOps)) {
6625 for (SDValue &CatOp : CatOps) {
6626 SDValue NotCat = IsNOT(CatOp, DAG);
6627 if (!NotCat) return SDValue();
6628 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6629 }
6630 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6631 }
6632 return SDValue();
6633}
6634
6635void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
6636 bool Lo, bool Unary) {
6637 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
6638 "Illegal vector type to unpack");
6639 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6640 int NumElts = VT.getVectorNumElements();
6641 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
6642 for (int i = 0; i < NumElts; ++i) {
6643 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
6644 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
6645 Pos += (Unary ? 0 : NumElts * (i % 2));
6646 Pos += (Lo ? 0 : NumEltsInLane / 2);
6647 Mask.push_back(Pos);
6648 }
6649}
6650
6651/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
6652/// imposed by AVX and specific to the unary pattern. Example:
6653/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
6654/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
6655void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6656 bool Lo) {
6657 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6658 int NumElts = VT.getVectorNumElements();
6659 for (int i = 0; i < NumElts; ++i) {
6660 int Pos = i / 2;
6661 Pos += (Lo ? 0 : NumElts / 2);
6662 Mask.push_back(Pos);
6663 }
6664}
6665
6666/// Returns a vector_shuffle node for an unpackl operation.
6667static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6668 SDValue V1, SDValue V2) {
6669 SmallVector<int, 8> Mask;
6670 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6671 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6672}
6673
6674/// Returns a vector_shuffle node for an unpackh operation.
6675static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
6676 SDValue V1, SDValue V2) {
6677 SmallVector<int, 8> Mask;
6678 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6679 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6680}
6681
6682/// Return a vector_shuffle of the specified vector of zero or undef vector.
6683/// This produces a shuffle where the low element of V2 is swizzled into the
6684/// zero/undef vector, landing at element Idx.
6685/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6686static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6687 bool IsZero,
6688 const X86Subtarget &Subtarget,
6689 SelectionDAG &DAG) {
6690 MVT VT = V2.getSimpleValueType();
6691 SDValue V1 = IsZero
6692 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6693 int NumElems = VT.getVectorNumElements();
6694 SmallVector<int, 16> MaskVec(NumElems);
6695 for (int i = 0; i != NumElems; ++i)
6696 // If this is the insertion idx, put the low elt of V2 here.
6697 MaskVec[i] = (i == Idx) ? NumElems : i;
6698 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6699}
6700
6701static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
6702 if (Ptr.getOpcode() == X86ISD::Wrapper ||
6703 Ptr.getOpcode() == X86ISD::WrapperRIP)
6704 Ptr = Ptr.getOperand(0);
6705
6706 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6707 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6708 return nullptr;
6709
6710 return CNode->getConstVal();
6711}
6712
6713static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6714 if (!Load || !ISD::isNormalLoad(Load))
6715 return nullptr;
6716 return getTargetConstantFromBasePtr(Load->getBasePtr());
6717}
6718
6719static const Constant *getTargetConstantFromNode(SDValue Op) {
6720 Op = peekThroughBitcasts(Op);
6721 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6722}
6723
6724const Constant *
6725X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6726 assert(LD && "Unexpected null LoadSDNode");
6727 return getTargetConstantFromNode(LD);
6728}
6729
6730// Extract raw constant bits from constant pools.
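// The constant data is repartitioned into EltSizeInBits-wide elements, e.g. a
// v4i32 constant queried with EltSizeInBits=64 yields two 64-bit elements
// (undef handling is controlled by the AllowWholeUndefs/AllowPartialUndefs
// flags).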
6731static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6732 APInt &UndefElts,
6733 SmallVectorImpl<APInt> &EltBits,
6734 bool AllowWholeUndefs = true,
6735 bool AllowPartialUndefs = true) {
6736 assert(EltBits.empty() && "Expected an empty EltBits vector");
6737
6738 Op = peekThroughBitcasts(Op);
6739
6740 EVT VT = Op.getValueType();
6741 unsigned SizeInBits = VT.getSizeInBits();
6742 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6743 unsigned NumElts = SizeInBits / EltSizeInBits;
6744
6745 // Bitcast a source array of element bits to the target size.
6746 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6747 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6748 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6749 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6750 "Constant bit sizes don't match");
6751
6752 // Don't split if we don't allow undef bits.
6753 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6754 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6755 return false;
6756
6757 // If we're already the right size, don't bother bitcasting.
6758 if (NumSrcElts == NumElts) {
6759 UndefElts = UndefSrcElts;
6760 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6761 return true;
6762 }
6763
6764 // Extract all the undef/constant element data and pack into single bitsets.
6765 APInt UndefBits(SizeInBits, 0);
6766 APInt MaskBits(SizeInBits, 0);
6767
6768 for (unsigned i = 0; i != NumSrcElts; ++i) {
6769 unsigned BitOffset = i * SrcEltSizeInBits;
6770 if (UndefSrcElts[i])
6771 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6772 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6773 }
6774
6775 // Split the undef/constant single bitset data into the target elements.
6776 UndefElts = APInt(NumElts, 0);
6777 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6778
6779 for (unsigned i = 0; i != NumElts; ++i) {
6780 unsigned BitOffset = i * EltSizeInBits;
6781 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6782
6783 // Only treat an element as UNDEF if all bits are UNDEF.
6784 if (UndefEltBits.isAllOnesValue()) {
6785 if (!AllowWholeUndefs)
6786 return false;
6787 UndefElts.setBit(i);
6788 continue;
6789 }
6790
6791 // If only some bits are UNDEF then treat them as zero (or bail if not
6792 // supported).
6793 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6794 return false;
6795
6796 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6797 }
6798 return true;
6799 };
6800
6801 // Collect constant bits and insert into mask/undef bit masks.
6802 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6803 unsigned UndefBitIndex) {
6804 if (!Cst)
6805 return false;
6806 if (isa<UndefValue>(Cst)) {
6807 Undefs.setBit(UndefBitIndex);
6808 return true;
6809 }
6810 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6811 Mask = CInt->getValue();
6812 return true;
6813 }
6814 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6815 Mask = CFP->getValueAPF().bitcastToAPInt();
6816 return true;
6817 }
6818 return false;
6819 };
6820
6821 // Handle UNDEFs.
6822 if (Op.isUndef()) {
6823 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6824 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6825 return CastBitData(UndefSrcElts, SrcEltBits);
6826 }
6827
6828 // Extract scalar constant bits.
6829 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6830 APInt UndefSrcElts = APInt::getNullValue(1);
6831 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6832 return CastBitData(UndefSrcElts, SrcEltBits);
6833 }
6834 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6835 APInt UndefSrcElts = APInt::getNullValue(1);
6836 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6837 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6838 return CastBitData(UndefSrcElts, SrcEltBits);
6839 }
6840
6841 // Extract constant bits from build vector.
6842 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6843 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6844 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6845
6846 APInt UndefSrcElts(NumSrcElts, 0);
6847 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6848 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6849 const SDValue &Src = Op.getOperand(i);
6850 if (Src.isUndef()) {
6851 UndefSrcElts.setBit(i);
6852 continue;
6853 }
6854 auto *Cst = cast<ConstantSDNode>(Src);
6855 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6856 }
6857 return CastBitData(UndefSrcElts, SrcEltBits);
6858 }
6859 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6860 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6861 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6862
6863 APInt UndefSrcElts(NumSrcElts, 0);
6864 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6865 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6866 const SDValue &Src = Op.getOperand(i);
6867 if (Src.isUndef()) {
6868 UndefSrcElts.setBit(i);
6869 continue;
6870 }
6871 auto *Cst = cast<ConstantFPSDNode>(Src);
6872 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6873 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6874 }
6875 return CastBitData(UndefSrcElts, SrcEltBits);
6876 }
6877
6878 // Extract constant bits from constant pool vector.
6879 if (auto *Cst = getTargetConstantFromNode(Op)) {
6880 Type *CstTy = Cst->getType();
6881 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6882 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6883 return false;
6884
6885 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6886 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6887
6888 APInt UndefSrcElts(NumSrcElts, 0);
6889 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6890 for (unsigned i = 0; i != NumSrcElts; ++i)
6891 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6892 UndefSrcElts, i))
6893 return false;
6894
6895 return CastBitData(UndefSrcElts, SrcEltBits);
6896 }
6897
6898 // Extract constant bits from a broadcasted constant pool scalar.
6899 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6900 EltSizeInBits <= VT.getScalarSizeInBits()) {
6901 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6902 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6903 return false;
6904
6905 SDValue Ptr = MemIntr->getBasePtr();
6906 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
6907 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6908 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6909
6910 APInt UndefSrcElts(NumSrcElts, 0);
6911 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6912 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6913 if (UndefSrcElts[0])
6914 UndefSrcElts.setBits(0, NumSrcElts);
6915 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6916 return CastBitData(UndefSrcElts, SrcEltBits);
6917 }
6918 }
6919 }
6920
6921 // Extract constant bits from a subvector broadcast.
6922 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
6923 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6924 SDValue Ptr = MemIntr->getBasePtr();
6925 // The source constant may be larger than the subvector broadcast, so
6926 // ensure we extract the correct subvector constants.
6927 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
6928 Type *CstTy = Cst->getType();
6929 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6930 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
6931 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
6932 (SizeInBits % SubVecSizeInBits) != 0)
6933 return false;
6934 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
6935 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
6936 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
6937 APInt UndefSubElts(NumSubElts, 0);
6938 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
6939 APInt(CstEltSizeInBits, 0));
6940 for (unsigned i = 0; i != NumSubElts; ++i) {
6941 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
6942 UndefSubElts, i))
6943 return false;
6944 for (unsigned j = 1; j != NumSubVecs; ++j)
6945 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
6946 }
6947 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
6948 UndefSubElts);
6949 return CastBitData(UndefSubElts, SubEltBits);
6950 }
6951 }
6952
6953 // Extract a rematerialized scalar constant insertion.
6954 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6955 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6956 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6957 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6958 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6959
6960 APInt UndefSrcElts(NumSrcElts, 0);
6961 SmallVector<APInt, 64> SrcEltBits;
6962 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6963 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6964 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6965 return CastBitData(UndefSrcElts, SrcEltBits);
6966 }
6967
6968 // Insert constant bits from base and subvector sources.
6969 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
6970 // If we bitcast to larger elements we might lose track of undefs - don't
6971 // allow any, to be safe.
6972 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6973 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
6974
6975 APInt UndefSrcElts, UndefSubElts;
6976 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
6977 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
6978 UndefSubElts, EltSubBits,
6979 AllowWholeUndefs && AllowUndefs,
6980 AllowPartialUndefs && AllowUndefs) &&
6981 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
6982 UndefSrcElts, EltSrcBits,
6983 AllowWholeUndefs && AllowUndefs,
6984 AllowPartialUndefs && AllowUndefs)) {
6985 unsigned BaseIdx = Op.getConstantOperandVal(2);
6986 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
6987 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6988 EltSrcBits[BaseIdx + i] = EltSubBits[i];
6989 return CastBitData(UndefSrcElts, EltSrcBits);
6990 }
6991 }
6992
6993 // Extract constant bits from a subvector's source.
6994 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
6995 // TODO - support extract_subvector through bitcasts.
6996 if (EltSizeInBits != VT.getScalarSizeInBits())
6997 return false;
6998
6999 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7000 UndefElts, EltBits, AllowWholeUndefs,
7001 AllowPartialUndefs)) {
7002 EVT SrcVT = Op.getOperand(0).getValueType();
7003 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7004 unsigned NumSubElts = VT.getVectorNumElements();
7005 unsigned BaseIdx = Op.getConstantOperandVal(1);
7006 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7007 if ((BaseIdx + NumSubElts) != NumSrcElts)
7008 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7009 if (BaseIdx != 0)
7010 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7011 return true;
7012 }
7013 }
7014
7015 // Extract constant bits from shuffle node sources.
7016 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7017 // TODO - support shuffle through bitcasts.
7018 if (EltSizeInBits != VT.getScalarSizeInBits())
7019 return false;
7020
7021 ArrayRef<int> Mask = SVN->getMask();
7022 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7023 llvm::any_of(Mask, [](int M) { return M < 0; }))
7024 return false;
7025
7026 APInt UndefElts0, UndefElts1;
7027 SmallVector<APInt, 32> EltBits0, EltBits1;
7028 if (isAnyInRange(Mask, 0, NumElts) &&
7029 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7030 UndefElts0, EltBits0, AllowWholeUndefs,
7031 AllowPartialUndefs))
7032 return false;
7033 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7034 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7035 UndefElts1, EltBits1, AllowWholeUndefs,
7036 AllowPartialUndefs))
7037 return false;
7038
7039 UndefElts = APInt::getNullValue(NumElts);
7040 for (int i = 0; i != (int)NumElts; ++i) {
7041 int M = Mask[i];
7042 if (M < 0) {
7043 UndefElts.setBit(i);
7044 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
7045 } else if (M < (int)NumElts) {
7046 if (UndefElts0[M])
7047 UndefElts.setBit(i);
7048 EltBits.push_back(EltBits0[M]);
7049 } else {
7050 if (UndefElts1[M - NumElts])
7051 UndefElts.setBit(i);
7052 EltBits.push_back(EltBits1[M - NumElts]);
7053 }
7054 }
7055 return true;
7056 }
7057
7058 return false;
7059}
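Editor's sketch (not part of the analyzed source): a minimal example of how this helper is queried elsewhere in this file, assuming V is a vector-typed constant SDValue supplied by the caller; it splits V into 8-bit constants and skips lanes that are wholly undef.

// Sketch only: V is a hypothetical SDValue; names prefixed "Demo" are not
// taken from the analyzed source.
APInt DemoUndefElts;
SmallVector<APInt, 16> DemoEltBits;
if (getTargetConstantBitsFromNode(V, /*EltSizeInBits=*/8, DemoUndefElts,
                                  DemoEltBits, /*AllowWholeUndefs=*/true,
                                  /*AllowPartialUndefs=*/false)) {
  for (unsigned i = 0, e = DemoEltBits.size(); i != e; ++i) {
    if (DemoUndefElts[i])
      continue; // lane i is entirely undef
    uint64_t Byte = DemoEltBits[i].getZExtValue(); // 8-bit constant of lane i
    (void)Byte;
  }
}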
7060
7061namespace llvm {
7062namespace X86 {
7063bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7064 APInt UndefElts;
7065 SmallVector<APInt, 16> EltBits;
7066 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7067 UndefElts, EltBits, true,
7068 AllowPartialUndefs)) {
7069 int SplatIndex = -1;
7070 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7071 if (UndefElts[i])
7072 continue;
7073 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7074 SplatIndex = -1;
7075 break;
7076 }
7077 SplatIndex = i;
7078 }
7079 if (0 <= SplatIndex) {
7080 SplatVal = EltBits[SplatIndex];
7081 return true;
7082 }
7083 }
7084
7085 return false;
7086}
7087} // namespace X86
7088} // namespace llvm
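A brief usage sketch for the splat query above (editorial addition; SplatSrc is a hypothetical SDValue, not taken from the analyzed code):

// Sketch only: SplatSrc is a hypothetical vector SDValue.
APInt SplatVal;
if (X86::isConstantSplat(SplatSrc, SplatVal, /*AllowPartialUndefs=*/true)) {
  // All defined elements share SplatVal; its bit width equals
  // SplatSrc.getScalarValueSizeInBits().
}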
7089
7090static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7091 unsigned MaskEltSizeInBits,
7092 SmallVectorImpl<uint64_t> &RawMask,
7093 APInt &UndefElts) {
7094 // Extract the raw target constant bits.
7095 SmallVector<APInt, 64> EltBits;
7096 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7097 EltBits, /* AllowWholeUndefs */ true,
7098 /* AllowPartialUndefs */ false))
7099 return false;
7100
7101 // Insert the extracted elements into the mask.
7102 for (const APInt &Elt : EltBits)
7103 RawMask.push_back(Elt.getZExtValue());
7104
7105 return true;
7106}
7107
7108/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7109/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7110/// Note: This ignores saturation, so inputs must be checked first.
7111static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7112 bool Unary, unsigned NumStages = 1) {
7113 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7114 unsigned NumElts = VT.getVectorNumElements();
7115 unsigned NumLanes = VT.getSizeInBits() / 128;
7116 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7117 unsigned Offset = Unary ? 0 : NumElts;
7118 unsigned Repetitions = 1u << (NumStages - 1);
7119 unsigned Increment = 1u << NumStages;
7120 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7121
7122 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7123 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7124 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7125 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7126 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7127 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7128 }
7129 }
7130}
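To illustrate the loops above (editorial addition; the values were worked out by hand, so treat them as a sketch rather than authoritative output): a single-stage binary pack producing v16i8 selects the even byte of every 16-bit source element, lane by lane.

// Sketch only: expected mask for a binary, single-stage pack to v16i8.
SmallVector<int, 16> DemoPackMask;
createPackShuffleMask(MVT::v16i8, DemoPackMask, /*Unary=*/false);
// DemoPackMask == {0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30}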
7131
7132// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7133static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7134 APInt &DemandedLHS, APInt &DemandedRHS) {
7135 int NumLanes = VT.getSizeInBits() / 128;
7136 int NumElts = DemandedElts.getBitWidth();
7137 int NumInnerElts = NumElts / 2;
7138 int NumEltsPerLane = NumElts / NumLanes;
7139 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7140
7141 DemandedLHS = APInt::getNullValue(NumInnerElts);
7142 DemandedRHS = APInt::getNullValue(NumInnerElts);
7143
7144 // Map DemandedElts to the packed operands.
7145 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7146 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7147 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7148 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7149 if (DemandedElts[OuterIdx])
7150 DemandedLHS.setBit(InnerIdx);
7151 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7152 DemandedRHS.setBit(InnerIdx);
7153 }
7154 }
7155}
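A hand-worked sketch of the mapping above (editorial addition, not from the analyzed source): for a 128-bit v16i8 pack result, demanding result elements 3 and 12 demands element 3 of the first operand and element 4 of the second.

// Sketch only: v16i8 result, demand elements 3 and 12.
APInt DemoDemanded = APInt::getNullValue(16);
DemoDemanded.setBit(3);
DemoDemanded.setBit(12);
APInt DemoLHS, DemoRHS;
getPackDemandedElts(MVT::v16i8, DemoDemanded, DemoLHS, DemoRHS);
// DemoLHS has bit 3 set; DemoRHS has bit 4 set (12 == 4 + NumInnerEltsPerLane).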
7156
7157// Split the demanded elts of a HADD/HSUB node between its operands.
7158static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7159 APInt &DemandedLHS, APInt &DemandedRHS) {
7160 int NumLanes = VT.getSizeInBits() / 128;
7161 int NumElts = DemandedElts.getBitWidth();
7162 int NumEltsPerLane = NumElts / NumLanes;
7163 int HalfEltsPerLane = NumEltsPerLane / 2;
7164
7165 DemandedLHS = APInt::getNullValue(NumElts);
7166 DemandedRHS = APInt::getNullValue(NumElts);
7167
7168 // Map DemandedElts to the horizontal operands.
7169 for (int Idx = 0; Idx != NumElts; ++Idx) {
7170 if (!DemandedElts[Idx])
7171 continue;
7172 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7173 int LocalIdx = Idx % NumEltsPerLane;
7174 if (LocalIdx < HalfEltsPerLane) {
7175 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7176 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7177 } else {
7178 LocalIdx -= HalfEltsPerLane;
7179 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7180 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7181 }
7182 }
7183}
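Similarly, a hand-worked sketch for the horizontal case (editorial addition): for a v4i32 HADD, result element 1 is LHS[2] + LHS[3], so demanding it demands those two LHS elements and nothing from the RHS.

// Sketch only: v4i32 HADD, demand result element 1.
APInt DemoDemanded = APInt::getNullValue(4);
DemoDemanded.setBit(1);
APInt DemoLHS, DemoRHS;
getHorizDemandedElts(MVT::v4i32, DemoDemanded, DemoLHS, DemoRHS);
// DemoLHS has bits 2 and 3 set; DemoRHS stays zero.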
7184
7185/// Calculates the shuffle mask corresponding to the target-specific opcode.
7186/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7187/// operands in \p Ops, and returns true.
7188/// Sets \p IsUnary to true if only one source is used. Note that this will set
7189/// IsUnary for shuffles which use a single input multiple times, and in those
7190/// cases it will adjust the mask to only have indices within that single input.
7191/// It is an error to call this with non-empty Mask/Ops vectors.
7192static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7193 SmallVectorImpl<SDValue> &Ops,
7194 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7195 unsigned NumElems = VT.getVectorNumElements();
7196 unsigned MaskEltSize = VT.getScalarSizeInBits();
7197 SmallVector<uint64_t, 32> RawMask;
7198 APInt RawUndefs;
7199 uint64_t ImmN;
7200
7201 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7202 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7203
7204 IsUnary = false;
7205 bool IsFakeUnary = false;
7206 switch (N->getOpcode()) {
7207 case X86ISD::BLENDI:
7208 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7209 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7210 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7211 DecodeBLENDMask(NumElems, ImmN, Mask);
7212 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7213 break;
7214 case X86ISD::SHUFP:
7215 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7216 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7217 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7218 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7219 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7220 break;
7221 case X86ISD::INSERTPS:
7222 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7223 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7224 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7225 DecodeINSERTPSMask(ImmN, Mask);
7226 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7227 break;
7228 case X86ISD::EXTRQI:
7229 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7230 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7231 isa<ConstantSDNode>(N->getOperand(2))) {
7232 int BitLen = N->getConstantOperandVal(1);
7233 int BitIdx = N->getConstantOperandVal(2);
7234 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7235 IsUnary = true;
7236 }
7237 break;
7238 case X86ISD::INSERTQI:
7239 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7240 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7241 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7242 isa<ConstantSDNode>(N->getOperand(3))) {
7243 int BitLen = N->getConstantOperandVal(2);
7244 int BitIdx = N->getConstantOperandVal(3);
7245 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7246 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7247 }
7248 break;
7249 case X86ISD::UNPCKH:
7250 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7251 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7252 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7253 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7254 break;
7255 case X86ISD::UNPCKL:
7256 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7257 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7258 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7259 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7260 break;
7261 case X86ISD::MOVHLPS:
7262 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7263 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7264 DecodeMOVHLPSMask(NumElems, Mask);
7265 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7266 break;
7267 case X86ISD::MOVLHPS:
7268 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7269 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7270 DecodeMOVLHPSMask(NumElems, Mask);
7271 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7272 break;
7273 case X86ISD::VALIGN:
7274 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7275 "Only 32-bit and 64-bit elements are supported!");
7276 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7277 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7278 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7279 DecodeVALIGNMask(NumElems, ImmN, Mask);
7280 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7281 Ops.push_back(N->getOperand(1));
7282 Ops.push_back(N->getOperand(0));
7283 break;
7284 case X86ISD::PALIGNR:
7285 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7286 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7287 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7288 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7289 DecodePALIGNRMask(NumElems, ImmN, Mask);
7290 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7291 Ops.push_back(N->getOperand(1));
7292 Ops.push_back(N->getOperand(0));
7293 break;
7294 case X86ISD::VSHLDQ:
7295 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7296 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7297 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7298 DecodePSLLDQMask(NumElems, ImmN, Mask);
7299 IsUnary = true;
7300 break;
7301 case X86ISD::VSRLDQ:
7302 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7303 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7304 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7305 DecodePSRLDQMask(NumElems, ImmN, Mask);
7306 IsUnary = true;
7307 break;
7308 case X86ISD::PSHUFD:
7309 case X86ISD::VPERMILPI:
7310 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7311 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7312 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7313 IsUnary = true;
7314 break;
7315 case X86ISD::PSHUFHW:
7316 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7317 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7318 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7319 IsUnary = true;
7320 break;
7321 case X86ISD::PSHUFLW:
7322 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7323 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7324 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7325 IsUnary = true;
7326 break;
7327 case X86ISD::VZEXT_MOVL:
7328 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7329 DecodeZeroMoveLowMask(NumElems, Mask);
7330 IsUnary = true;
7331 break;
7332 case X86ISD::VBROADCAST:
7333 // We only decode broadcasts of same-sized vectors; peeking through to
7334 // extracted subvectors is likely to cause hasOneUse issues with
7335 // SimplifyDemandedBits etc.
7336 if (N->getOperand(0).getValueType() == VT) {
7337 DecodeVectorBroadcast(NumElems, Mask);
7338 IsUnary = true;
7339 break;
7340 }
7341 return false;
7342 case X86ISD::VPERMILPV: {
7343 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7344 IsUnary = true;
7345 SDValue MaskNode = N->getOperand(1);
7346 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7347 RawUndefs)) {
7348 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7349 break;
7350 }
7351 return false;
7352 }
7353 case X86ISD::PSHUFB: {
7354 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7355 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7356 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7357 IsUnary = true;
7358 SDValue MaskNode = N->getOperand(1);
7359 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7360 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7361 break;
7362 }
7363 return false;
7364 }
7365 case X86ISD::VPERMI:
7366 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7367 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7368 DecodeVPERMMask(NumElems, ImmN, Mask);
7369 IsUnary = true;
7370 break;
7371 case X86ISD::MOVSS:
7372 case X86ISD::MOVSD:
7373 case X86ISD::MOVSH:
7374 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7375 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7376 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7377 break;
7378 case X86ISD::VPERM2X128:
7379 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7380 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7381 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7382 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7383 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7384 break;
7385 case X86ISD::SHUF128:
7386 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7387 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7388 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7389 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7390 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7391 break;
7392 case X86ISD::MOVSLDUP:
7393 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7394 DecodeMOVSLDUPMask(NumElems, Mask);
7395 IsUnary = true;
7396 break;
7397 case X86ISD::MOVSHDUP:
7398 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7399 DecodeMOVSHDUPMask(NumElems, Mask);
7400 IsUnary = true;
7401 break;
7402 case X86ISD::MOVDDUP:
7403 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7404 DecodeMOVDDUPMask(NumElems, Mask);
7405 IsUnary = true;
7406 break;
7407 case X86ISD::VPERMIL2: {
7408 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7409 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7410 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7411 SDValue MaskNode = N->getOperand(2);
7412 SDValue CtrlNode = N->getOperand(3);
7413 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7414 unsigned CtrlImm = CtrlOp->getZExtValue();
7415 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7416 RawUndefs)) {
7417 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7418 Mask);
7419 break;
7420 }
7421 }
7422 return false;
7423 }
7424 case X86ISD::VPPERM: {
7425 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7426 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7427 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7428 SDValue MaskNode = N->getOperand(2);
7429 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7430 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7431 break;
7432 }
7433 return false;
7434 }
7435 case X86ISD::VPERMV: {
7436 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7437 IsUnary = true;
7438 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7439 Ops.push_back(N->getOperand(1));
7440 SDValue MaskNode = N->getOperand(0);
7441 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7442 RawUndefs)) {
7443 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7444 break;
7445 }
7446 return false;
7447 }
7448 case X86ISD::VPERMV3: {
7449 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7450 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7451 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7452 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7453 Ops.push_back(N->getOperand(0));
7454 Ops.push_back(N->getOperand(2));
7455 SDValue MaskNode = N->getOperand(1);
7456 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7457 RawUndefs)) {
7458 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7459 break;
7460 }
7461 return false;
7462 }
7463 default: llvm_unreachable("unknown target shuffle node");
7464 }
7465
7466 // Empty mask indicates the decode failed.
7467 if (Mask.empty())
7468 return false;
7469
7470 // Check if we're getting a shuffle mask with zero'd elements.
7471 if (!AllowSentinelZero && isAnyZero(Mask))
7472 return false;
7473
7474 // If we have a fake unary shuffle, the shuffle mask is spread across two
7475 // inputs that are actually the same node. Re-map the mask to always point
7476 // into the first input.
7477 if (IsFakeUnary)
7478 for (int &M : Mask)
7479 if (M >= (int)Mask.size())
7480 M -= Mask.size();
7481
7482 // If we didn't already add operands in the opcode-specific code, default to
7483 // adding 1 or 2 operands starting at 0.
7484 if (Ops.empty()) {
7485 Ops.push_back(N->getOperand(0));
7486 if (!IsUnary || IsFakeUnary)
7487 Ops.push_back(N->getOperand(1));
7488 }
7489
7490 return true;
7491}
7492
7493// Wrapper for getTargetShuffleMask that ignores the IsUnary flag.
7494static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7495 SmallVectorImpl<SDValue> &Ops,
7496 SmallVectorImpl<int> &Mask) {
7497 bool IsUnary;
7498 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
7499}
7500
7501/// Compute whether each element of a shuffle is zeroable.
7502///
7503/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7504/// Either it is an undef element in the shuffle mask, the element of the input
7505/// referenced is undef, or the element of the input referenced is known to be
7506/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7507/// as many lanes with this technique as possible to simplify the remaining
7508/// shuffle.
7509static void computeZeroableShuffleElements(ArrayRef<int> Mask,
7510 SDValue V1, SDValue V2,
7511 APInt &KnownUndef, APInt &KnownZero) {
7512 int Size = Mask.size();
7513 KnownUndef = KnownZero = APInt::getNullValue(Size);
7514
7515 V1 = peekThroughBitcasts(V1);
7516 V2 = peekThroughBitcasts(V2);
7517
7518 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7519 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7520
7521 int VectorSizeInBits = V1.getValueSizeInBits();
7522 int ScalarSizeInBits = VectorSizeInBits / Size;
7523 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7524
7525 for (int i = 0; i < Size; ++i) {
7526 int M = Mask[i];
7527 // Handle the easy cases.
7528 if (M < 0) {
7529 KnownUndef.setBit(i);
7530 continue;
7531 }
7532 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7533 KnownZero.setBit(i);
7534 continue;
7535 }
7536
7537 // Determine shuffle input and normalize the mask.
7538 SDValue V = M < Size ? V1 : V2;
7539 M %= Size;
7540
7541 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7542 if (V.getOpcode() != ISD::BUILD_VECTOR)
7543 continue;
7544
7545 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
7546 // the (larger) source element must be UNDEF/ZERO.
7547 if ((Size % V.getNumOperands()) == 0) {
7548 int Scale = Size / V->getNumOperands();
7549 SDValue Op = V.getOperand(M / Scale);
7550 if (Op.isUndef())
7551 KnownUndef.setBit(i);
7552 if (X86::isZeroNode(Op))
7553 KnownZero.setBit(i);
7554 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7555 APInt Val = Cst->getAPIntValue();
7556 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7557 if (Val == 0)
7558 KnownZero.setBit(i);
7559 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7560 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7561 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7562 if (Val == 0)
7563 KnownZero.setBit(i);
7564 }
7565 continue;
7566 }
7567
7568 // If the BUILD_VECTOR has more elements, then all the (smaller) source
7569 // elements must be UNDEF or ZERO.
7570 if ((V.getNumOperands() % Size) == 0) {
7571 int Scale = V->getNumOperands() / Size;
7572 bool AllUndef = true;
7573 bool AllZero = true;
7574 for (int j = 0; j < Scale; ++j) {
7575 SDValue Op = V.getOperand((M * Scale) + j);
7576 AllUndef &= Op.isUndef();
7577 AllZero &= X86::isZeroNode(Op);
7578 }
7579 if (AllUndef)
7580 KnownUndef.setBit(i);
7581 if (AllZero)
7582 KnownZero.setBit(i);
7583 continue;
7584 }
7585 }
7586}
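Editorial sketch (not in the analyzed source): callers of this helper generally OR the two results into a single "zeroable" mask; DemoShufMask, V1 and V2 below are hypothetical shuffle inputs.

// Sketch only: DemoShufMask, V1 and V2 are hypothetical shuffle inputs.
APInt DemoUndef, DemoZero;
computeZeroableShuffleElements(DemoShufMask, V1, V2, DemoUndef, DemoZero);
APInt Zeroable = DemoUndef | DemoZero; // lanes that may be lowered to zero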
7587
7588/// Decode a target shuffle mask and inputs and see if any values are
7589/// known to be undef or zero from their inputs.
7590/// Returns true if the target shuffle mask was decoded.
7591/// FIXME: Merge this with computeZeroableShuffleElements?
7592static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7593 SmallVectorImpl<SDValue> &Ops,
7594 APInt &KnownUndef, APInt &KnownZero) {
7595 bool IsUnary;
7596 if (!isTargetShuffle(N.getOpcode()))
7597 return false;
7598
7599 MVT VT = N.getSimpleValueType();
7600 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7601 return false;
7602
7603 int Size = Mask.size();
7604 SDValue V1 = Ops[0];
7605 SDValue V2 = IsUnary ? V1 : Ops[1];
7606 KnownUndef = KnownZero = APInt::getNullValue(Size);
7607
7608 V1 = peekThroughBitcasts(V1);
7609 V2 = peekThroughBitcasts(V2);
7610
7611 assert((VT.getSizeInBits() % Size) == 0 &&
7612 "Illegal split of shuffle value type");
7613 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7614
7615 // Extract known constant input data.
7616 APInt UndefSrcElts[2];
7617 SmallVector<APInt, 32> SrcEltBits[2];
7618 bool IsSrcConstant[2] = {
7619 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7620 SrcEltBits[0], true, false),
7621 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7622 SrcEltBits[1], true, false)};
7623
7624 for (int i = 0; i < Size; ++i) {
7625 int M = Mask[i];
7626
7627 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7628 if (M < 0) {
7629 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7630 if (SM_SentinelUndef == M)
7631 KnownUndef.setBit(i);
7632 if (SM_SentinelZero == M)
7633 KnownZero.setBit(i);
7634 continue;
7635 }
7636
7637 // Determine shuffle input and normalize the mask.
7638 unsigned SrcIdx = M / Size;
7639 SDValue V = M < Size ? V1 : V2;
7640 M %= Size;
7641
7642 // We are referencing an UNDEF input.
7643 if (V.isUndef()) {
7644 KnownUndef.setBit(i);
7645 continue;
7646 }
7647
7648 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7649 // TODO: We currently only set UNDEF for integer types - floats use the same
7650 // registers as vectors and many of the scalar folded loads rely on the
7651 // SCALAR_TO_VECTOR pattern.
7652 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7653 (Size % V.getValueType().getVectorNumElements()) == 0) {
7654 int Scale = Size / V.getValueType().getVectorNumElements();
7655 int Idx = M / Scale;
7656 if (Idx != 0 && !VT.isFloatingPoint())
7657 KnownUndef.setBit(i);
7658 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7659 KnownZero.setBit(i);
7660 continue;
7661 }
7662
7663 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
7664 // base vectors.
7665 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
7666 SDValue Vec = V.getOperand(0);
7667 int NumVecElts = Vec.getValueType().getVectorNumElements();
7668 if (Vec.isUndef() && Size == NumVecElts) {
7669 int Idx = V.getConstantOperandVal(2);
7670 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
7671 if (M < Idx || (Idx + NumSubElts) <= M)
7672 KnownUndef.setBit(i);
7673 }
7674 continue;
7675 }
7676
7677 // Attempt to extract from the source's constant bits.
7678 if (IsSrcConstant[SrcIdx]) {
7679 if (UndefSrcElts[SrcIdx][M])
7680 KnownUndef.setBit(i);
7681 else if (SrcEltBits[SrcIdx][M] == 0)
7682 KnownZero.setBit(i);
7683 }
7684 }
7685
7686 assert(VT.getVectorNumElements() == (unsigned)Size &&
7687 "Different mask size from vector size!");
7688 return true;
7689}
7690
7691// Replace target shuffle mask elements with known undef/zero sentinels.
7692static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7693 const APInt &KnownUndef,
7694 const APInt &KnownZero,
7695 bool ResolveKnownZeros= true) {
7696 unsigned NumElts = Mask.size();
7697 assert(KnownUndef.getBitWidth() == NumElts &&
7698 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
7699
7700 for (unsigned i = 0; i != NumElts; ++i) {
7701 if (KnownUndef[i])
7702 Mask[i] = SM_SentinelUndef;
7703 else if (ResolveKnownZeros && KnownZero[i])
7704 Mask[i] = SM_SentinelZero;
7705 }
7706}
7707
7708// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7709static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7710 APInt &KnownUndef,
7711 APInt &KnownZero) {
7712 unsigned NumElts = Mask.size();
7713 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7714
7715 for (unsigned i = 0; i != NumElts; ++i) {
7716 int M = Mask[i];
7717 if (SM_SentinelUndef == M)
7718 KnownUndef.setBit(i);
7719 if (SM_SentinelZero == M)
7720 KnownZero.setBit(i);
7721 }
7722}
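// A minimal standalone sketch of the two helpers above, shown for illustration
// only: plain std::vector and int sentinels stand in for SmallVectorImpl, APInt
// and the SM_Sentinel* values, and the "sketch" namespace and names are not
// part of X86ISelLowering.cpp.
#include <cassert>
#include <cstddef>
#include <vector>

namespace sketch {
constexpr int SentinelUndef = -1; // stand-in for SM_SentinelUndef
constexpr int SentinelZero = -2;  // stand-in for SM_SentinelZero

// Forward direction: mask elements flagged as undef/zero become sentinels.
void resolveFromZeroables(std::vector<int> &Mask,
                          const std::vector<bool> &KnownUndef,
                          const std::vector<bool> &KnownZero,
                          bool ResolveKnownZeros = true) {
  assert(Mask.size() == KnownUndef.size() && Mask.size() == KnownZero.size() &&
         "Shuffle mask size mismatch");
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (KnownUndef[i])
      Mask[i] = SentinelUndef;
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SentinelZero;
  }
}

// Reverse direction: sentinel mask elements are extracted back into bitmasks.
void resolveZeroables(const std::vector<int> &Mask,
                      std::vector<bool> &KnownUndef,
                      std::vector<bool> &KnownZero) {
  KnownUndef.assign(Mask.size(), false);
  KnownZero.assign(Mask.size(), false);
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] == SentinelUndef)
      KnownUndef[i] = true;
    if (Mask[i] == SentinelZero)
      KnownZero[i] = true;
  }
}
} // namespace sketch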
7723
7724// Forward declaration (for getFauxShuffleMask recursive check).
7725// TODO: Use DemandedElts variant.
7726static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7727 SmallVectorImpl<int> &Mask,
7728 const SelectionDAG &DAG, unsigned Depth,
7729 bool ResolveKnownElts);
7730
7731// Attempt to decode ops that could be represented as a shuffle mask.
7732// The decoded shuffle mask may contain a different number of elements to the
7733// destination value type.
7734static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7735 SmallVectorImpl<int> &Mask,
7736 SmallVectorImpl<SDValue> &Ops,
7737 const SelectionDAG &DAG, unsigned Depth,
7738 bool ResolveKnownElts) {
7739 Mask.clear();
7740 Ops.clear();
7741
7742 MVT VT = N.getSimpleValueType();
7743 unsigned NumElts = VT.getVectorNumElements();
7744 unsigned NumSizeInBits = VT.getSizeInBits();
7745 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7746 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7747 return false;
7748 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7749 unsigned NumSizeInBytes = NumSizeInBits / 8;
7750 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7751
7752 unsigned Opcode = N.getOpcode();
7753 switch (Opcode) {
7754 case ISD::VECTOR_SHUFFLE: {
7755 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7756 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7757 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7758 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7759 Ops.push_back(N.getOperand(0));
7760 Ops.push_back(N.getOperand(1));
7761 return true;
7762 }
7763 return false;
7764 }
7765 case ISD::AND:
7766 case X86ISD::ANDNP: {
7767 // Attempt to decode as a per-byte mask.
7768 APInt UndefElts;
7769 SmallVector<APInt, 32> EltBits;
7770 SDValue N0 = N.getOperand(0);
7771 SDValue N1 = N.getOperand(1);
7772 bool IsAndN = (X86ISD::ANDNP == Opcode);
7773 uint64_t ZeroMask = IsAndN ? 255 : 0;
7774 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7775 return false;
7776 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7777 if (UndefElts[i]) {
7778 Mask.push_back(SM_SentinelUndef);
7779 continue;
7780 }
7781 const APInt &ByteBits = EltBits[i];
7782 if (ByteBits != 0 && ByteBits != 255)
7783 return false;
7784 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7785 }
7786 Ops.push_back(IsAndN ? N1 : N0);
7787 return true;
7788 }
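  // Worked example (illustrative, not in the original source): decoding
  //   AND v2i64 X, <0x00000000FFFFFFFF, 0xFFFFFFFFFFFFFFFF>
  // at byte granularity gives the byte shuffle mask
  //   { 0,1,2,3, Z,Z,Z,Z, 8,9,10,11, 12,13,14,15 }    (Z = SM_SentinelZero)
  // since 0xFF bytes pass the corresponding byte of X through and 0x00 bytes
  // force a known-zero byte.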
7789 case ISD::OR: {
7790 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7791 // is a valid shuffle index.
7792 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
7793 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
7794 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7795 return false;
7796 SmallVector<int, 64> SrcMask0, SrcMask1;
7797 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7798 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7799 true) ||
7800 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7801 true))
7802 return false;
7803
7804 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7805 SmallVector<int, 64> Mask0, Mask1;
7806 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7807 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7808 for (int i = 0; i != (int)MaskSize; ++i) {
7809 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
7810 // loops converting between OR and BLEND shuffles due to
7811 // canWidenShuffleElements merging away undef elements, meaning we
7812 // fail to recognise the OR as the undef element isn't known zero.
7813 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7814 Mask.push_back(SM_SentinelZero);
7815 else if (Mask1[i] == SM_SentinelZero)
7816 Mask.push_back(i);
7817 else if (Mask0[i] == SM_SentinelZero)
7818 Mask.push_back(i + MaskSize);
7819 else
7820 return false;
7821 }
7822 Ops.push_back(N0);
7823 Ops.push_back(N1);
7824 return true;
7825 }
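  // Worked example (illustrative, not in the original source): if the decoded
  // per-element masks of the two sources are
  //   Mask0 = { 0, Z, 2, Z }  and  Mask1 = { Z, 1, Z, 3 }    (Z = known zero)
  // then every lane is taken from whichever side is not zero, yielding the
  // blend mask { 0, 5, 2, 7 } over the operand list [N0, N1].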
7826 case ISD::INSERT_SUBVECTOR: {
7827 SDValue Src = N.getOperand(0);
7828 SDValue Sub = N.getOperand(1);
7829 EVT SubVT = Sub.getValueType();
7830 unsigned NumSubElts = SubVT.getVectorNumElements();
7831 if (!N->isOnlyUserOf(Sub.getNode()))
7832 return false;
7833 uint64_t InsertIdx = N.getConstantOperandVal(2);
7834 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7835 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7836 Sub.getOperand(0).getValueType() == VT) {
7837 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7838 for (int i = 0; i != (int)NumElts; ++i)
7839 Mask.push_back(i);
7840 for (int i = 0; i != (int)NumSubElts; ++i)
7841 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7842 Ops.push_back(Src);
7843 Ops.push_back(Sub.getOperand(0));
7844 return true;
7845 }
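    // Worked example (illustrative, not in the original source): for a v8i32
    //   INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1, 4), 0)
    // the identity mask { 0..7 } has its first four elements rewritten to
    // NumElts + ExtractIdx + i, giving { 12,13,14,15, 4,5,6,7 } over
    // [SRC0, SRC1].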
7846 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7847 SmallVector<int, 64> SubMask;
7848 SmallVector<SDValue, 2> SubInputs;
7849 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7850 SubMask, DAG, Depth + 1, ResolveKnownElts))
7851 return false;
7852
7853 // Subvector shuffle inputs must not be larger than the subvector.
7854 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
7855 return SubVT.getFixedSizeInBits() <
7856 SubInput.getValueSizeInBits().getFixedSize();
7857 }))
7858 return false;
7859
7860 if (SubMask.size() != NumSubElts) {
7861 assert(((SubMask.size() % NumSubElts) == 0 ||
7862 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7863 if ((NumSubElts % SubMask.size()) == 0) {
7864 int Scale = NumSubElts / SubMask.size();
7865 SmallVector<int,64> ScaledSubMask;
7866 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
7867 SubMask = ScaledSubMask;
7868 } else {
7869 int Scale = SubMask.size() / NumSubElts;
7870 NumSubElts = SubMask.size();
7871 NumElts *= Scale;
7872 InsertIdx *= Scale;
7873 }
7874 }
7875 Ops.push_back(Src);
7876 Ops.append(SubInputs.begin(), SubInputs.end());
7877 if (ISD::isBuildVectorAllZeros(Src.getNode()))
7878 Mask.append(NumElts, SM_SentinelZero);
7879 else
7880 for (int i = 0; i != (int)NumElts; ++i)
7881 Mask.push_back(i);
7882 for (int i = 0; i != (int)NumSubElts; ++i) {
7883 int M = SubMask[i];
7884 if (0 <= M) {
7885 int InputIdx = M / NumSubElts;
7886 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7887 }
7888 Mask[i + InsertIdx] = M;
7889 }
7890 return true;
7891 }
7892 case X86ISD::PINSRB:
7893 case X86ISD::PINSRW:
7894 case ISD::SCALAR_TO_VECTOR:
7895 case ISD::INSERT_VECTOR_ELT: {
7896 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
7897 // vector, for matching src/dst vector types.
7898 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
7899
7900 unsigned DstIdx = 0;
7901 if (Opcode != ISD::SCALAR_TO_VECTOR) {
7902 // Check we have an in-range constant insertion index.
7903 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7904 N.getConstantOperandAPInt(2).uge(NumElts))
7905 return false;
7906 DstIdx = N.getConstantOperandVal(2);
7907
7908 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
7909 if (X86::isZeroNode(Scl)) {
7910 Ops.push_back(N.getOperand(0));
7911 for (unsigned i = 0; i != NumElts; ++i)
7912 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
7913 return true;
7914 }
7915 }
7916
7917 // Peek through trunc/aext/zext.
7918 // TODO: aext shouldn't require SM_SentinelZero padding.
7919 // TODO: handle shift of scalars.
7920 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
7921 while (Scl.getOpcode() == ISD::TRUNCATE ||
7922 Scl.getOpcode() == ISD::ANY_EXTEND ||
7923 Scl.getOpcode() == ISD::ZERO_EXTEND) {
7924 Scl = Scl.getOperand(0);
7925 MinBitsPerElt =
7926 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
7927 }
7928 if ((MinBitsPerElt % 8) != 0)
7929 return false;
7930
7931 // Attempt to find the source vector the scalar was extracted from.
7932 SDValue SrcExtract;
7933 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
7934 Scl.getOpcode() == X86ISD::PEXTRW ||
7935 Scl.getOpcode() == X86ISD::PEXTRB) &&
7936 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
7937 SrcExtract = Scl;
7938 }
7939 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7940 return false;
7941
7942 SDValue SrcVec = SrcExtract.getOperand(0);
7943 EVT SrcVT = SrcVec.getValueType();
7944 if (!SrcVT.getScalarType().isByteSized())
7945 return false;
7946 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7947 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
7948 unsigned DstByte = DstIdx * NumBytesPerElt;
7949 MinBitsPerElt =
7950 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
7951
7952 // Create 'identity' byte level shuffle mask and then add inserted bytes.
7953 if (Opcode == ISD::SCALAR_TO_VECTOR) {
7954 Ops.push_back(SrcVec);
7955 Mask.append(NumSizeInBytes, SM_SentinelUndef);
7956 } else {
7957 Ops.push_back(SrcVec);
7958 Ops.push_back(N.getOperand(0));
7959 for (int i = 0; i != (int)NumSizeInBytes; ++i)
7960 Mask.push_back(NumSizeInBytes + i);
7961 }
7962
7963 unsigned MinBytesPerElts = MinBitsPerElt / 8;
7964 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
7965 for (unsigned i = 0; i != MinBytesPerElts; ++i)
7966 Mask[DstByte + i] = SrcByte + i;
7967 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
7968 Mask[DstByte + i] = SM_SentinelZero;
7969 return true;
7970 }
7971 case X86ISD::PACKSS:
7972 case X86ISD::PACKUS: {
7973 SDValue N0 = N.getOperand(0);
7974 SDValue N1 = N.getOperand(1);
7975 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7976 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7977 "Unexpected input value type");
7978
7979 APInt EltsLHS, EltsRHS;
7980 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7981
7982 // If we know input saturation won't happen (or we don't care for particular
7983 // lanes), we can treat this as a truncation shuffle.
7984 bool Offset0 = false, Offset1 = false;
7985 if (Opcode == X86ISD::PACKSS) {
7986 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
7987 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7988 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
7989 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7990 return false;
7991 // We can't easily fold ASHR into a shuffle, but if it was feeding a
7992 // PACKSS then it was likely being used for sign-extension for a
7993 // truncation, so just peek through and adjust the mask accordingly.
7994 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
7995 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
7996 Offset0 = true;
7997 N0 = N0.getOperand(0);
7998 }
7999 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8000 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8001 Offset1 = true;
8002 N1 = N1.getOperand(0);
8003 }
8004 } else {
8005 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8006 if ((!(N0.isUndef() || EltsLHS.isNullValue()) &&
8007 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8008 (!(N1.isUndef() || EltsRHS.isNullValue()) &&
8009 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8010 return false;
8011 }
8012
8013 bool IsUnary = (N0 == N1);
8014
8015 Ops.push_back(N0);
8016 if (!IsUnary)
8017 Ops.push_back(N1);
8018
8019 createPackShuffleMask(VT, Mask, IsUnary);
8020
8021 if (Offset0 || Offset1) {
8022 for (int &M : Mask)
8023 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8024 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8025 ++M;
8026 }
8027 return true;
8028 }
8029 case X86ISD::VTRUNC: {
8030 SDValue Src = N.getOperand(0);
8031 EVT SrcVT = Src.getValueType();
8032 // Truncated source must be a simple vector.
8033 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8034 (SrcVT.getScalarSizeInBits() % 8) != 0)
8035 return false;
8036 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8037 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8038 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8039 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8040 for (unsigned i = 0; i != NumSrcElts; ++i)
8041 Mask.push_back(i * Scale);
8042 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8043 Ops.push_back(Src);
8044 return true;
8045 }
8046 case X86ISD::VSHLI:
8047 case X86ISD::VSRLI: {
8048 uint64_t ShiftVal = N.getConstantOperandVal(1);
8049 // Out of range bit shifts are guaranteed to be zero.
8050 if (NumBitsPerElt <= ShiftVal) {
8051 Mask.append(NumElts, SM_SentinelZero);
8052 return true;
8053 }
8054
8055 // We can only decode 'whole byte' bit shifts as shuffles.
8056 if ((ShiftVal % 8) != 0)
8057 break;
8058
8059 uint64_t ByteShift = ShiftVal / 8;
8060 Ops.push_back(N.getOperand(0));
8061
8062 // Clear mask to all zeros and insert the shifted byte indices.
8063 Mask.append(NumSizeInBytes, SM_SentinelZero);
8064
8065 if (X86ISD::VSHLI == Opcode) {
8066 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8067 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8068 Mask[i + j] = i + j - ByteShift;
8069 } else {
8070 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8071 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8072 Mask[i + j - ByteShift] = i + j;
8073 }
8074 return true;
8075 }
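  // Worked example (illustrative, not in the original source): a VSHLI of
  // v4i32 by 8 bits is a whole-byte shift (ByteShift = 1), so each 4-byte
  // element keeps its low three bytes shifted up by one and zeroes the vacated
  // low byte:
  //   Mask = { Z,0,1,2, Z,4,5,6, Z,8,9,10, Z,12,13,14 }   (Z = SM_SentinelZero)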
8076 case X86ISD::VROTLI:
8077 case X86ISD::VROTRI: {
8078 // We can only decode 'whole byte' bit rotates as shuffles.
8079 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8080 if ((RotateVal % 8) != 0)
8081 return false;
8082 Ops.push_back(N.getOperand(0));
8083 int Offset = RotateVal / 8;
8084 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8085 for (int i = 0; i != (int)NumElts; ++i) {
8086 int BaseIdx = i * NumBytesPerElt;
8087 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8088 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8089 }
8090 }
8091 return true;
8092 }
8093 case X86ISD::VBROADCAST: {
8094 SDValue Src = N.getOperand(0);
8095 if (!Src.getSimpleValueType().isVector()) {
8096 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8097 !isNullConstant(Src.getOperand(1)) ||
8098 Src.getOperand(0).getValueType().getScalarType() !=
8099 VT.getScalarType())
8100 return false;
8101 Src = Src.getOperand(0);
8102 }
8103 Ops.push_back(Src);
8104 Mask.append(NumElts, 0);
8105 return true;
8106 }
8107 case ISD::ZERO_EXTEND:
8108 case ISD::ANY_EXTEND:
8109 case ISD::ZERO_EXTEND_VECTOR_INREG:
8110 case ISD::ANY_EXTEND_VECTOR_INREG: {
8111 SDValue Src = N.getOperand(0);
8112 EVT SrcVT = Src.getValueType();
8113
8114 // Extended source must be a simple vector.
8115 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8116 (SrcVT.getScalarSizeInBits() % 8) != 0)
8117 return false;
8118
8119 bool IsAnyExtend =
8120 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8121 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8122 IsAnyExtend, Mask);
8123 Ops.push_back(Src);
8124 return true;
8125 }
8126 }
8127
8128 return false;
8129}
8130
8131/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8132static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8133 SmallVectorImpl<int> &Mask) {
8134 int MaskWidth = Mask.size();
8135 SmallVector<SDValue, 16> UsedInputs;
8136 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8137 int lo = UsedInputs.size() * MaskWidth;
8138 int hi = lo + MaskWidth;
8139
8140 // Strip UNDEF input usage.
8141 if (Inputs[i].isUndef())
8142 for (int &M : Mask)
8143 if ((lo <= M) && (M < hi))
8144 M = SM_SentinelUndef;
8145
8146 // Check for unused inputs.
8147 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8148 for (int &M : Mask)
8149 if (lo <= M)
8150 M -= MaskWidth;
8151 continue;
8152 }
8153
8154 // Check for repeated inputs.
8155 bool IsRepeat = false;
8156 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8157 if (UsedInputs[j] != Inputs[i])
8158 continue;
8159 for (int &M : Mask)
8160 if (lo <= M)
8161 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8162 IsRepeat = true;
8163 break;
8164 }
8165 if (IsRepeat)
8166 continue;
8167
8168 UsedInputs.push_back(Inputs[i]);
8169 }
8170 Inputs = UsedInputs;
8171}
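// Worked example (illustrative, not in the original source): given
//   Inputs = [A, B, A] and a width-4 mask { 0, 9, 4, 8 },
// the duplicate A in slot 2 (mask indices 8..11) is folded onto slot 0, so
// index 9 becomes 1 and index 8 becomes 0, while B's index 4 is unchanged:
//   Inputs = [A, B], Mask = { 0, 1, 4, 0 }.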
8172
8173/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8174/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8175/// Returns true if the target shuffle mask was decoded.
8176static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8177 SmallVectorImpl<SDValue> &Inputs,
8178 SmallVectorImpl<int> &Mask,
8179 APInt &KnownUndef, APInt &KnownZero,
8180 const SelectionDAG &DAG, unsigned Depth,
8181 bool ResolveKnownElts) {
8182 EVT VT = Op.getValueType();
8183 if (!VT.isSimple() || !VT.isVector())
8184 return false;
8185
8186 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8187 if (ResolveKnownElts)
8188 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8189 return true;
8190 }
8191 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8192 ResolveKnownElts)) {
8193 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8194 return true;
8195 }
8196 return false;
8197}
8198
8199static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8200 SmallVectorImpl<int> &Mask,
8201 const SelectionDAG &DAG, unsigned Depth = 0,
8202 bool ResolveKnownElts = true) {
8203 EVT VT = Op.getValueType();
8204 if (!VT.isSimple() || !VT.isVector())
8205 return false;
8206
8207 APInt KnownUndef, KnownZero;
8208 unsigned NumElts = Op.getValueType().getVectorNumElements();
8209 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
8210 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8211 KnownZero, DAG, Depth, ResolveKnownElts);
8212}
8213
8214// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8215static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8216 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8217 SelectionDAG &DAG) {
8218 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8219 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8220 "Unknown broadcast load type");
8221
8222 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8223 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8224 return SDValue();
8225
8226 SDValue Ptr =
8227 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8228 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8229 SDValue Ops[] = {Mem->getChain(), Ptr};
8230 SDValue BcstLd = DAG.getMemIntrinsicNode(
8231 Opcode, DL, Tys, Ops, MemVT,
8232 DAG.getMachineFunction().getMachineMemOperand(
8233 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8234 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8235 return BcstLd;
8236}
8237
8238/// Returns the scalar element that will make up the i'th
8239/// element of the result of the vector shuffle.
8240static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8241 SelectionDAG &DAG, unsigned Depth) {
8242 if (Depth >= SelectionDAG::MaxRecursionDepth)
8243 return SDValue(); // Limit search depth.
8244
8245 EVT VT = Op.getValueType();
8246 unsigned Opcode = Op.getOpcode();
8247 unsigned NumElems = VT.getVectorNumElements();
8248
8249 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8250 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8251 int Elt = SV->getMaskElt(Index);
8252
8253 if (Elt < 0)
8254 return DAG.getUNDEF(VT.getVectorElementType());
8255
8256 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8257 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8258 }
8259
8260 // Recurse into target specific vector shuffles to find scalars.
8261 if (isTargetShuffle(Opcode)) {
8262 MVT ShufVT = VT.getSimpleVT();
8263 MVT ShufSVT = ShufVT.getVectorElementType();
8264 int NumElems = (int)ShufVT.getVectorNumElements();
8265 SmallVector<int, 16> ShuffleMask;
8266 SmallVector<SDValue, 16> ShuffleOps;
8267 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8268 ShuffleMask))
8269 return SDValue();
8270
8271 int Elt = ShuffleMask[Index];
8272 if (Elt == SM_SentinelZero)
8273 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8274 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8275 if (Elt == SM_SentinelUndef)
8276 return DAG.getUNDEF(ShufSVT);
8277
8278 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8279 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8280 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8281 }
8282
8283 // Recurse into insert_subvector base/sub vector to find scalars.
8284 if (Opcode == ISD::INSERT_SUBVECTOR) {
8285 SDValue Vec = Op.getOperand(0);
8286 SDValue Sub = Op.getOperand(1);
8287 uint64_t SubIdx = Op.getConstantOperandVal(2);
8288 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8289
8290 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8291 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8292 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8293 }
8294
8295 // Recurse into concat_vectors sub vector to find scalars.
8296 if (Opcode == ISD::CONCAT_VECTORS) {
8297 EVT SubVT = Op.getOperand(0).getValueType();
8298 unsigned NumSubElts = SubVT.getVectorNumElements();
8299 uint64_t SubIdx = Index / NumSubElts;
8300 uint64_t SubElt = Index % NumSubElts;
8301 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8302 }
8303
8304 // Recurse into extract_subvector src vector to find scalars.
8305 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8306 SDValue Src = Op.getOperand(0);
8307 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8308 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8309 }
8310
8311 // We only peek through bitcasts of the same vector width.
8312 if (Opcode == ISD::BITCAST) {
8313 SDValue Src = Op.getOperand(0);
8314 EVT SrcVT = Src.getValueType();
8315 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8316 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8317 return SDValue();
8318 }
8319
8320 // Actual nodes that may contain scalar elements
8321
8322 // For insert_vector_elt - either return the index matching scalar or recurse
8323 // into the base vector.
8324 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8325 isa<ConstantSDNode>(Op.getOperand(2))) {
8326 if (Op.getConstantOperandAPInt(2) == Index)
8327 return Op.getOperand(1);
8328 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8329 }
8330
8331 if (Opcode == ISD::SCALAR_TO_VECTOR)
8332 return (Index == 0) ? Op.getOperand(0)
8333 : DAG.getUNDEF(VT.getVectorElementType());
8334
8335 if (Opcode == ISD::BUILD_VECTOR)
8336 return Op.getOperand(Index);
8337
8338 return SDValue();
8339}
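// Worked example (illustrative, not in the original source): for a
// VECTOR_SHUFFLE of v4i32 operands %a, %b with mask <6, 0, 1, 7>, asking for
// element 0 reads mask value 6, so the search recurses into %b at index
// 6 % 4 = 2; if %b is a BUILD_VECTOR, its third scalar operand is returned
// directly.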
8340
8341// Use PINSRB/PINSRW/PINSRD to create a build vector.
8342static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8343 unsigned NumNonZero, unsigned NumZero,
8344 SelectionDAG &DAG,
8345 const X86Subtarget &Subtarget) {
8346 MVT VT = Op.getSimpleValueType();
8347 unsigned NumElts = VT.getVectorNumElements();
8348 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8349 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8350 "Illegal vector insertion");
8351
8352 SDLoc dl(Op);
8353 SDValue V;
8354 bool First = true;
8355
8356 for (unsigned i = 0; i < NumElts; ++i) {
8357 bool IsNonZero = NonZeroMask[i];
8358 if (!IsNonZero)
8359 continue;
8360
8361 // If the build vector contains zeros or our first insertion is not the
8362 // first index, then insert into a zero vector to break any register
8363 // dependency; else use SCALAR_TO_VECTOR.
8364 if (First) {
8365 First = false;
8366 if (NumZero || 0 != i)
8367 V = getZeroVector(VT, Subtarget, DAG, dl);
8368 else {
8369 assert(0 == i && "Expected insertion into zero-index");
8370 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8371 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8372 V = DAG.getBitcast(VT, V);
8373 continue;
8374 }
8375 }
8376 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8377 DAG.getIntPtrConstant(i, dl));
8378 }
8379
8380 return V;
8381}
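// Worked example (illustrative, not in the original source): lowering
//   build_vector <8 x i16> (x, 0, y, 0, 0, 0, 0, 0)
// has NumZero != 0, so the first insertion starts from a zero vector and the
// result is conceptually
//   insert_vector_elt(insert_vector_elt(zero, x, 0), y, 2),
// which is then matched to PINSRW instructions against a zeroed register.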
8382
8383/// Custom lower build_vector of v16i8.
8384static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8385 unsigned NumNonZero, unsigned NumZero,
8386 SelectionDAG &DAG,
8387 const X86Subtarget &Subtarget) {
8388 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8389 return SDValue();
8390
8391 // SSE4.1 - use PINSRB to insert each byte directly.
8392 if (Subtarget.hasSSE41())
8393 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8394 Subtarget);
8395
8396 SDLoc dl(Op);
8397 SDValue V;
8398
8399 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8400 for (unsigned i = 0; i < 16; i += 2) {
8401 bool ThisIsNonZero = NonZeroMask[i];
8402 bool NextIsNonZero = NonZeroMask[i + 1];
8403 if (!ThisIsNonZero && !NextIsNonZero)
8404 continue;
8405
8406 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
8407 SDValue Elt;
8408 if (ThisIsNonZero) {
8409 if (NumZero || NextIsNonZero)
8410 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8411 else
8412 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8413 }
8414
8415 if (NextIsNonZero) {
8416 SDValue NextElt = Op.getOperand(i + 1);
8417 if (i == 0 && NumZero)
8418 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
8419 else
8420 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
8421 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
8422 DAG.getConstant(8, dl, MVT::i8));
8423 if (ThisIsNonZero)
8424 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
8425 else
8426 Elt = NextElt;
8427 }
8428
8429 // If our first insertion is not the first index or zeros are needed, then
8430 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
8431 // elements undefined).
8432 if (!V) {
8433 if (i != 0 || NumZero)
8434 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
8435 else {
8436 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
8437 V = DAG.getBitcast(MVT::v8i16, V);
8438 continue;
8439 }
8440 }
8441 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
8442 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
8443 DAG.getIntPtrConstant(i / 2, dl));
8444 }
8445
8446 return DAG.getBitcast(MVT::v16i8, V);
8447}
8448
8449/// Custom lower build_vector of v8i16.
8450static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
8451 unsigned NumNonZero, unsigned NumZero,
8452 SelectionDAG &DAG,
8453 const X86Subtarget &Subtarget) {
8454 if (NumNonZero > 4 && !Subtarget.hasSSE41())
8455 return SDValue();
8456
8457 // Use PINSRW to insert each byte directly.
8458 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8459 Subtarget);
8460}
8461
8462/// Custom lower build_vector of v4i32 or v4f32.
8463static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
8464 const X86Subtarget &Subtarget) {
8465 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
8466 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
8467 // Because we're creating a less complicated build vector here, we may enable
8468 // further folding of the MOVDDUP via shuffle transforms.
8469 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
8470 Op.getOperand(0) == Op.getOperand(2) &&
8471 Op.getOperand(1) == Op.getOperand(3) &&
8472 Op.getOperand(0) != Op.getOperand(1)) {
8473 SDLoc DL(Op);
8474 MVT VT = Op.getSimpleValueType();
8475 MVT EltVT = VT.getVectorElementType();
8476 // Create a new build vector with the first 2 elements followed by undef
8477 // padding, bitcast to v2f64, duplicate, and bitcast back.
8478 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8479 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8480 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
8481 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
8482 return DAG.getBitcast(VT, Dup);
8483 }
8484
8485 // Find all zeroable elements.
8486 std::bitset<4> Zeroable, Undefs;
8487 for (int i = 0; i < 4; ++i) {
8488 SDValue Elt = Op.getOperand(i);
8489 Undefs[i] = Elt.isUndef();
8490 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
8491 }
8492 assert(Zeroable.size() - Zeroable.count() > 1 &&
8493 "We expect at least two non-zero elements!");
8494
8495 // We only know how to deal with build_vector nodes where elements are either
8496 // zeroable or extract_vector_elt with constant index.
8497 SDValue FirstNonZero;
8498 unsigned FirstNonZeroIdx;
8499 for (unsigned i = 0; i < 4; ++i) {
8500 if (Zeroable[i])
8501 continue;
8502 SDValue Elt = Op.getOperand(i);
8503 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8504 !isa<ConstantSDNode>(Elt.getOperand(1)))
8505 return SDValue();
8506 // Make sure that this node is extracting from a 128-bit vector.
8507 MVT VT = Elt.getOperand(0).getSimpleValueType();
8508 if (!VT.is128BitVector())
8509 return SDValue();
8510 if (!FirstNonZero.getNode()) {
8511 FirstNonZero = Elt;
8512 FirstNonZeroIdx = i;
8513 }
8514 }
8515
8516 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
8517 SDValue V1 = FirstNonZero.getOperand(0);
8518 MVT VT = V1.getSimpleValueType();
8519
8520 // See if this build_vector can be lowered as a blend with zero.
8521 SDValue Elt;
8522 unsigned EltMaskIdx, EltIdx;
8523 int Mask[4];
8524 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
8525 if (Zeroable[EltIdx]) {
8526 // The zero vector will be on the right hand side.
8527 Mask[EltIdx] = EltIdx+4;
8528 continue;
8529 }
8530
8531 Elt = Op->getOperand(EltIdx);
8532 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
8533 EltMaskIdx = Elt.getConstantOperandVal(1);
8534 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
8535 break;
8536 Mask[EltIdx] = EltIdx;
8537 }
8538
8539 if (EltIdx == 4) {
8540 // Let the shuffle legalizer deal with blend operations.
8541 SDValue VZeroOrUndef = (Zeroable == Undefs)
8542 ? DAG.getUNDEF(VT)
8543 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
8544 if (V1.getSimpleValueType() != VT)
8545 V1 = DAG.getBitcast(VT, V1);
8546 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
8547 }
8548
8549 // See if we can lower this build_vector to a INSERTPS.
8550 if (!Subtarget.hasSSE41())
8551 return SDValue();
8552
8553 SDValue V2 = Elt.getOperand(0);
8554 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
8555 V1 = SDValue();
8556
8557 bool CanFold = true;
8558 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
8559 if (Zeroable[i])
8560 continue;
8561
8562 SDValue Current = Op->getOperand(i);
8563 SDValue SrcVector = Current->getOperand(0);
8564 if (!V1.getNode())
8565 V1 = SrcVector;
8566 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
8567 }
8568
8569 if (!CanFold)
8570 return SDValue();
8571
8572 assert(V1.getNode() && "Expected at least two non-zero elements!");
8573 if (V1.getSimpleValueType() != MVT::v4f32)
8574 V1 = DAG.getBitcast(MVT::v4f32, V1);
8575 if (V2.getSimpleValueType() != MVT::v4f32)
8576 V2 = DAG.getBitcast(MVT::v4f32, V2);
8577
8578 // Ok, we can emit an INSERTPS instruction.
8579 unsigned ZMask = Zeroable.to_ulong();
8580
8581 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8582 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8583 SDLoc DL(Op);
8584 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8585 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8586 return DAG.getBitcast(VT, Result);
8587}
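// A minimal standalone sketch of the INSERTPS immediate assembled above, shown
// for illustration only; insertPSImm is not an LLVM helper. Bits 7:6 select the
// source element, bits 5:4 the destination lane, and bits 3:0 the lanes to zero,
// matching EltMaskIdx << 6 | EltIdx << 4 | ZMask in the code above.
#include <cstdint>

constexpr std::uint8_t insertPSImm(unsigned SrcElt, unsigned DstElt,
                                   unsigned ZeroLanes) {
  return static_cast<std::uint8_t>((SrcElt & 3) << 6 | (DstElt & 3) << 4 |
                                   (ZeroLanes & 0xF));
}

// Take element 2 of the source, place it in destination lane 1, zero lane 3:
static_assert(insertPSImm(2, 1, 0b1000) == 0x98, "INSERTPS immediate encoding");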
8588
8589/// Return a vector logical shift node.
8590static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8591 SelectionDAG &DAG, const TargetLowering &TLI,
8592 const SDLoc &dl) {
8593 assert(VT.is128BitVector() && "Unknown type for VShift");
8594 MVT ShVT = MVT::v16i8;
8595 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8596 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8597 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8598 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8599 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8600}
8601
8602static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8603 SelectionDAG &DAG) {
8604
8605 // Check if the scalar load can be widened into a vector load. And if
8606 // the address is "base + cst" see if the cst can be "absorbed" into
8607 // the shuffle mask.
8608 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8609 SDValue Ptr = LD->getBasePtr();
8610 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8611 return SDValue();
8612 EVT PVT = LD->getValueType(0);
8613 if (PVT != MVT::i32 && PVT != MVT::f32)
8614 return SDValue();
8615
8616 int FI = -1;
8617 int64_t Offset = 0;
8618 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8619 FI = FINode->getIndex();
8620 Offset = 0;
8621 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8622 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8623 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8624 Offset = Ptr.getConstantOperandVal(1);
8625 Ptr = Ptr.getOperand(0);
8626 } else {
8627 return SDValue();
8628 }
8629
8630 // FIXME: 256-bit vector instructions don't require a strict alignment,
8631 // improve this code to support it better.
8632 Align RequiredAlign(VT.getSizeInBits() / 8);
8633 SDValue Chain = LD->getChain();
8634 // Make sure the stack object alignment is at least 16 or 32.
8635 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8636 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
8637 if (!InferredAlign || *InferredAlign < RequiredAlign) {
8638 if (MFI.isFixedObjectIndex(FI)) {
8639 // Can't change the alignment. FIXME: It's possible to compute
8640 // the exact stack offset and reference FI + adjust offset instead.
8641 // If someone *really* cares about this. That's the way to implement it.
8642 return SDValue();
8643 } else {
8644 MFI.setObjectAlignment(FI, RequiredAlign);
8645 }
8646 }
8647
8648 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8649 // Ptr + (Offset & ~15).
8650 if (Offset < 0)
8651 return SDValue();
8652 if ((Offset % RequiredAlign.value()) & 3)
8653 return SDValue();
8654 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
8655 if (StartOffset) {
8656 SDLoc DL(Ptr);
8657 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8658 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8659 }
8660
8661 int EltNo = (Offset - StartOffset) >> 2;
8662 unsigned NumElems = VT.getVectorNumElements();
8663
8664 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8665 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8666 LD->getPointerInfo().getWithOffset(StartOffset));
8667
8668 SmallVector<int, 8> Mask(NumElems, EltNo);
8669
8670 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8671 }
8672
8673 return SDValue();
8674}
8675
8676// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8677static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8678 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8679 auto *BaseLd = cast<LoadSDNode>(Elt);
8680 if (!BaseLd->isSimple())
8681 return false;
8682 Ld = BaseLd;
8683 ByteOffset = 0;
8684 return true;
8685 }
8686
8687 switch (Elt.getOpcode()) {
8688 case ISD::BITCAST:
8689 case ISD::TRUNCATE:
8690 case ISD::SCALAR_TO_VECTOR:
8691 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8692 case ISD::SRL:
8693 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8694 uint64_t Idx = IdxC->getZExtValue();
8695 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8696 ByteOffset += Idx / 8;
8697 return true;
8698 }
8699 }
8700 break;
8701 case ISD::EXTRACT_VECTOR_ELT:
8702 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
8703 SDValue Src = Elt.getOperand(0);
8704 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8705 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8706 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8707 findEltLoadSrc(Src, Ld, ByteOffset)) {
8708 uint64_t Idx = IdxC->getZExtValue();
8709 ByteOffset += Idx * (SrcSizeInBits / 8);
8710 return true;
8711 }
8712 }
8713 break;
8714 }
8715
8716 return false;
8717}
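// Worked example (illustrative, not in the original source): for an element of
// the form (trunc i32 (srl (load i64 %p), 32)), the recursion peeks through the
// TRUNCATE, accepts the whole-byte SRL (32 bits), reaches the i64 load and
// reports Ld = load %p with ByteOffset = 4, i.e. the element is the upper half
// of that load.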
8718
8719/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8720/// elements can be replaced by a single large load which has the same value as
8721/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8722///
8723/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8724static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8725 const SDLoc &DL, SelectionDAG &DAG,
8726 const X86Subtarget &Subtarget,
8727 bool IsAfterLegalize) {
8728 if ((VT.getScalarSizeInBits() % 8) != 0)
8729 return SDValue();
8730
8731 unsigned NumElems = Elts.size();
8732
8733 int LastLoadedElt = -1;
8734 APInt LoadMask = APInt::getNullValue(NumElems);
8735 APInt ZeroMask = APInt::getNullValue(NumElems);
8736 APInt UndefMask = APInt::getNullValue(NumElems);
8737
8738 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8739 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8740
8741 // For each element in the initializer, see if we've found a load, zero or an
8742 // undef.
8743 for (unsigned i = 0; i < NumElems; ++i) {
8744 SDValue Elt = peekThroughBitcasts(Elts[i]);
8745 if (!Elt.getNode())
8746 return SDValue();
8747 if (Elt.isUndef()) {
8748 UndefMask.setBit(i);
8749 continue;
8750 }
8751 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8752 ZeroMask.setBit(i);
8753 continue;
8754 }
8755
8756 // Each loaded element must be the correct fractional portion of the
8757 // requested vector load.
8758 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8759 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8760 return SDValue();
8761
8762 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8763 return SDValue();
8764 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8765 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8766 return SDValue();
8767
8768 LoadMask.setBit(i);
8769 LastLoadedElt = i;
8770 }
8771 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8772 LoadMask.countPopulation()) == NumElems &&
8773 "Incomplete element masks");
8774
8775 // Handle Special Cases - all undef or undef/zero.
8776 if (UndefMask.countPopulation() == NumElems)
8777 return DAG.getUNDEF(VT);
8778 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8779 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8780 : DAG.getConstantFP(0.0, DL, VT);
8781
8782 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8783 int FirstLoadedElt = LoadMask.countTrailingZeros();
8784 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8785 EVT EltBaseVT = EltBase.getValueType();
8786 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8787 "Register/Memory size mismatch");
8788 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8789 assert(LDBase && "Did not find base load for merging consecutive loads");
8790 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8791 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8792 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
8793 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
8794 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8795
8796 // TODO: Support offsetting the base load.
8797 if (ByteOffsets[FirstLoadedElt] != 0)
8798 return SDValue();
8799
8800 // Check to see if the element's load is consecutive to the base load
8801 // or offset from a previous (already checked) load.
8802 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8803 LoadSDNode *Ld = Loads[EltIdx];
8804 int64_t ByteOffset = ByteOffsets[EltIdx];
8805 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8806 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8807 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8808 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8809 }
8810 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8811 EltIdx - FirstLoadedElt);
8812 };
8813
8814 // Consecutive loads can contain UNDEFS but not ZERO elements.
8815 // Consecutive loads with UNDEFs and ZERO elements require an
8816 // additional shuffle stage to clear the ZERO elements.
8817 bool IsConsecutiveLoad = true;
8818 bool IsConsecutiveLoadWithZeros = true;
8819 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8820 if (LoadMask[i]) {
8821 if (!CheckConsecutiveLoad(LDBase, i)) {
8822 IsConsecutiveLoad = false;
8823 IsConsecutiveLoadWithZeros = false;
8824 break;
8825 }
8826 } else if (ZeroMask[i]) {
8827 IsConsecutiveLoad = false;
8828 }
8829 }
8830
8831 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8832 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8833 assert(LDBase->isSimple() &&
8834 "Cannot merge volatile or atomic loads.");
8835 SDValue NewLd =
8836 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8837 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
8838 MMOFlags);
8839 for (auto *LD : Loads)
8840 if (LD)
8841 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8842 return NewLd;
8843 };
8844
8845 // Check if the base load is entirely dereferenceable.
8846 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8847 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8848
8849 // LOAD - all consecutive load/undefs (must start/end with a load or be
8850 // entirely dereferenceable). If we have found an entire vector of loads and
8851 // undefs, then return a large load of the entire vector width starting at the
8852 // base pointer. If the vector contains zeros, then attempt to shuffle those
8853 // elements.
8854 if (FirstLoadedElt == 0 &&
8855 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
8856 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8857 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8858 return SDValue();
8859
8860 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8861 // will lower to regular temporal loads and use the cache.
8862 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8863 VT.is256BitVector() && !Subtarget.hasInt256())
8864 return SDValue();
8865
8866 if (NumElems == 1)
8867 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8868
8869 if (!ZeroMask)
8870 return CreateLoad(VT, LDBase);
8871
8872 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8873 // vector and a zero vector to clear out the zero elements.
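// For example, { load p, zero, load p+8, load p+12 } still loads the full
// vector from p and then shuffles it with a zero vector using mask <0, 5, 2, 3>
// to clear lane 1.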
8874 if (!IsAfterLegalize && VT.isVector()) {
8875 unsigned NumMaskElts = VT.getVectorNumElements();
8876 if ((NumMaskElts % NumElems) == 0) {
8877 unsigned Scale = NumMaskElts / NumElems;
8878 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8879 for (unsigned i = 0; i < NumElems; ++i) {
8880 if (UndefMask[i])
8881 continue;
8882 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8883 for (unsigned j = 0; j != Scale; ++j)
8884 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8885 }
8886 SDValue V = CreateLoad(VT, LDBase);
8887 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8888 : DAG.getConstantFP(0.0, DL, VT);
8889 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8890 }
8891 }
8892 }
8893
8894 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8895 if (VT.is256BitVector() || VT.is512BitVector()) {
8896 unsigned HalfNumElems = NumElems / 2;
8897 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8898 EVT HalfVT =
8899 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8900 SDValue HalfLD =
8901 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8902 DAG, Subtarget, IsAfterLegalize);
8903 if (HalfLD)
8904 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8905 HalfLD, DAG.getIntPtrConstant(0, DL));
8906 }
8907 }
8908
8909 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
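// For example, a v4i32 build_vector of { load p, load p+4, zero, zero } becomes
// an X86ISD::VZEXT_LOAD that loads 64 bits from p and zero-fills the upper
// lanes of the register.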
8910 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8911 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
8912 LoadSizeInBits == 64) &&
8913 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8914 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8915 : MVT::getIntegerVT(LoadSizeInBits);
8916 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8917 // Allow v4f32 on SSE1-only targets.
8918 // FIXME: Add more isel patterns so we can just use VT directly.
8919 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8920 VecVT = MVT::v4f32;
8921 if (TLI.isTypeLegal(VecVT)) {
8922 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8923 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8924 SDValue ResNode = DAG.getMemIntrinsicNode(
8925 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
8926 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
8927 for (auto *LD : Loads)
8928 if (LD)
8929 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8930 return DAG.getBitcast(VT, ResNode);
8931 }
8932 }
8933
8934 // BROADCAST - match the smallest possible repetition pattern, load that
8935 // scalar/subvector element and then broadcast to the entire vector.
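// For example, { load p, load p+4, load p, load p+4 } repeats every two
// elements, so the repeated 64-bit pattern is loaded once from p and then
// broadcast with X86ISD::VBROADCAST.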
8936 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8937 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8938 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8939 unsigned RepeatSize = SubElems * BaseSizeInBits;
8940 unsigned ScalarSize = std::min(RepeatSize, 64u);
8941 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8942 continue;
8943
8944 // Don't attempt a 1:N subvector broadcast - it should be caught by
8945 // combineConcatVectorOps; otherwise it will cause infinite loops.
8946 if (RepeatSize > ScalarSize && SubElems == 1)
8947 continue;
8948
8949 bool Match = true;
8950 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8951 for (unsigned i = 0; i != NumElems && Match; ++i) {
8952 if (!LoadMask[i])
8953 continue;
8954 SDValue Elt = peekThroughBitcasts(Elts[i]);
8955 if (RepeatedLoads[i % SubElems].isUndef())
8956 RepeatedLoads[i % SubElems] = Elt;
8957 else
8958 Match &= (RepeatedLoads[i % SubElems] == Elt);
8959 }
8960
8961 // We must have loads at both ends of the repetition.
8962 Match &= !RepeatedLoads.front().isUndef();
8963 Match &= !RepeatedLoads.back().isUndef();
8964 if (!Match)
8965 continue;
8966
8967 EVT RepeatVT =
8968 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8969 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8970 : EVT::getFloatingPointVT(ScalarSize);
8971 if (RepeatSize > ScalarSize)
8972 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8973 RepeatSize / ScalarSize);
8974 EVT BroadcastVT =
8975 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8976 VT.getSizeInBits() / ScalarSize);
8977 if (TLI.isTypeLegal(BroadcastVT)) {
8978 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8979 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
8980 SDValue Broadcast = RepeatLoad;
8981 if (RepeatSize > ScalarSize) {
8982 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
8983 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
8984 } else {
8985 if (!Subtarget.hasAVX2() &&
8986 !MayFoldLoadIntoBroadcastFromMem(
8987 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
8988 /*AssumeSingleUse=*/true))
8989 return SDValue();
8990 Broadcast =
8991 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
8992 }
8993 return DAG.getBitcast(VT, Broadcast);
8994 }
8995 }
8996 }
8997 }
8998
8999 return SDValue();
9000}
9001
9002 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
9003 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9004 // are consecutive, non-overlapping, and in the right order.
9005static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9006 SelectionDAG &DAG,
9007 const X86Subtarget &Subtarget,
9008 bool IsAfterLegalize) {
9009 SmallVector<SDValue, 64> Elts;
9010 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9011 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9012 Elts.push_back(Elt);
9013 continue;
9014 }
9015 return SDValue();
9016 }
9017 assert(Elts.size() == VT.getVectorNumElements());
9018 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9019 IsAfterLegalize);
9020}
9021
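// Helper for lowerBuildVectorAsBroadcast below: split a repeating splat pattern
// of SplatBitSize bits into SplatBitSize / ScalarSize scalar constants of VT's
// element type, returned as a ConstantVector for the constant pool.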
9022static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9023 unsigned SplatBitSize, LLVMContext &C) {
9024 unsigned ScalarSize = VT.getScalarSizeInBits();
9025 unsigned NumElm = SplatBitSize / ScalarSize;
9026
9027 SmallVector<Constant *, 32> ConstantVec;
9028 for (unsigned i = 0; i < NumElm; i++) {
9029 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9030 Constant *Const;
9031 if (VT.isFloatingPoint()) {
9032 if (ScalarSize == 16) {
9033 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9034 } else if (ScalarSize == 32) {
9035 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9036 } else {
9037 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9038 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9039 }
9040 } else
9041 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9042 ConstantVec.push_back(Const);
9043 }
9044 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9045}
9046
9047static bool isFoldableUseOfShuffle(SDNode *N) {
9048 for (auto *U : N->uses()) {
9049 unsigned Opc = U->getOpcode();
9050 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9051 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9052 return false;
9053 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9054 return false;
9055 if (isTargetShuffle(Opc))
9056 return true;
9057 if (Opc == ISD::BITCAST) // Ignore bitcasts
9058 return isFoldableUseOfShuffle(U);
9059 if (N->hasOneUse())
9060 return true;
9061 }
9062 return false;
9063}
9064
9065/// Attempt to use the vbroadcast instruction to generate a splat value
9066/// from a splat BUILD_VECTOR which uses:
9067/// a. A single scalar load, or a constant.
9068/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9069///
9070/// The VBROADCAST node is returned when a pattern is found,
9071/// or SDValue() otherwise.
9072static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9073 const X86Subtarget &Subtarget,
9074 SelectionDAG &DAG) {
9075 // VBROADCAST requires AVX.
9076 // TODO: Splats could be generated for non-AVX CPUs using SSE
9077 // instructions, but there's less potential gain for only 128-bit vectors.
9078 if (!Subtarget.hasAVX())
9079 return SDValue();
9080
9081 MVT VT = BVOp->getSimpleValueType(0);
9082 unsigned NumElts = VT.getVectorNumElements();
9083 SDLoc dl(BVOp);
9084
9085 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9086 "Unsupported vector type for broadcast.");
9087
9088 // See if the build vector is a repeating sequence of scalars (inc. splat).
9089 SDValue Ld;
9090 BitVector UndefElements;
9091 SmallVector<SDValue, 16> Sequence;
9092 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9093 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9094 if (Sequence.size() == 1)
9095 Ld = Sequence[0];
9096 }
9097
9098 // Attempt to use VBROADCASTM
9099 // From this pattern:
9100 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9101 // b. t1 = (build_vector t0 t0)
9102 //
9103 // Create (VBROADCASTM v2i1 X)
9104 if (!Sequence.empty() && Subtarget.hasCDI()) {
9105 // If not a splat, are the upper sequence values zeroable?
9106 unsigned SeqLen = Sequence.size();
9107 bool UpperZeroOrUndef =
9108 SeqLen == 1 ||
9109 llvm::all_of(makeArrayRef(Sequence).drop_front(), [](SDValue V) {
9110 return !V || V.isUndef() || isNullConstant(V);
9111 });
9112 SDValue Op0 = Sequence[0];
9113 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9114 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9115 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9116 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9117 ? Op0.getOperand(0)
9118 : Op0.getOperand(0).getOperand(0);
9119 MVT MaskVT = BOperand.getSimpleValueType();
9120 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9121 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9122 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9123 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9124 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9125 unsigned Scale = 512 / VT.getSizeInBits();
9126 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9127 }
9128 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9129 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9130 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9131 return DAG.getBitcast(VT, Bcst);
9132 }
9133 }
9134 }
9135
9136 unsigned NumUndefElts = UndefElements.count();
9137 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9138 APInt SplatValue, Undef;
9139 unsigned SplatBitSize;
9140 bool HasUndef;
9141 // Check if this is a repeated constant pattern suitable for broadcasting.
9142 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9143 SplatBitSize > VT.getScalarSizeInBits() &&
9144 SplatBitSize < VT.getSizeInBits()) {
9145 // Avoid replacing with a broadcast when the value is used by a shuffle
9146 // instruction, to preserve the present custom lowering of shuffles.
9147 if (isFoldableUseOfShuffle(BVOp))
9148 return SDValue();
9149 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
9150 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9151 LLVMContext *Ctx = DAG.getContext();
9152 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9153 if (Subtarget.hasAVX()) {
9154 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9155 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9156 // Splatted value can fit in one INTEGER constant in constant pool.
9157 // Load the constant and broadcast it.
9158 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9159 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9160 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9161 SDValue CP = DAG.getConstantPool(C, PVT);
9162 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9163
9164 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9165 SDVTList Tys =
9166 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9167 SDValue Ops[] = {DAG.getEntryNode(), CP};
9168 MachinePointerInfo MPI =
9169 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9170 SDValue Brdcst = DAG.getMemIntrinsicNode(
9171 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9172 MachineMemOperand::MOLoad);
9173 return DAG.getBitcast(VT, Brdcst);
9174 }
9175 if (SplatBitSize > 64) {
9176 // Load the vector of constants and broadcast it.
9177 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9178 *Ctx);
9179 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9180 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9181 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9182 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9183 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9184 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9185 MachinePointerInfo MPI =
9186 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9187 return DAG.getMemIntrinsicNode(
9188 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9189 MachineMemOperand::MOLoad);
9190 }
9191 }
9192 }
9193
9194 // If we are moving a scalar into a vector (Ld must be set and all elements
9195 // but 1 are undef) and that operation is not obviously supported by
9196 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9197 // That's better than general shuffling and may eliminate a load to GPR and
9198 // move from scalar to vector register.
9199 if (!Ld || NumElts - NumUndefElts != 1)
9200 return SDValue();
9201 unsigned ScalarSize = Ld.getValueSizeInBits();
9202 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9203 return SDValue();
9204 }
9205
9206 bool ConstSplatVal =
9207 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9208 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9209
9210 // TODO: Handle broadcasts of non-constant sequences.
9211
9212 // Make sure that all of the users of a non-constant load are from the
9213 // BUILD_VECTOR node.
9214 // FIXME: Is the use count needed for non-constant, non-load case?
9215 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9216 return SDValue();
9217
9218 unsigned ScalarSize = Ld.getValueSizeInBits();
9219 bool IsGE256 = (VT.getSizeInBits() >= 256);
9220
9221 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9222 // instruction to save 8 or more bytes of constant pool data.
9223 // TODO: If multiple splats are generated to load the same constant,
9224 // it may be detrimental to overall size. There needs to be a way to detect
9225 // that condition to know if this is truly a size win.
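// For example, a 256-bit f64 splat needs 32 bytes of constant pool data as a
// full vector constant, but only 8 bytes when broadcast from a single scalar.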
9226 bool OptForSize = DAG.shouldOptForSize();
9227
9228 // Handle broadcasting a single constant scalar from the constant pool
9229 // into a vector.
9230 // On Sandybridge (no AVX2), it is still better to load a constant vector
9231 // from the constant pool and not to broadcast it from a scalar.
9232 // But override that restriction when optimizing for size.
9233 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9234 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9235 EVT CVT = Ld.getValueType();
9236 assert(!CVT.isVector() && "Must not broadcast a vector type");
9237
9238 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
9239 // For size optimization, also splat v2f64 and v2i64, and for size opt
9240 // with AVX2, also splat i8 and i16.
9241 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9242 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9243 (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
9244 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9245 const Constant *C = nullptr;
9246 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9247 C = CI->getConstantIntValue();
9248 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9249 C = CF->getConstantFPValue();
9250
9251 assert(C && "Invalid constant type");
9252
9253 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9254 SDValue CP =
9255 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9256 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9257
9258 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9259 SDValue Ops[] = {DAG.getEntryNode(), CP};
9260 MachinePointerInfo MPI =
9261 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9262 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9263 MPI, Alignment, MachineMemOperand::MOLoad);
9264 }
9265 }
9266
9267 // Handle AVX2 in-register broadcasts.
9268 if (!IsLoad && Subtarget.hasInt256() &&
9269 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9270 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9271
9272 // The scalar source must be a normal load.
9273 if (!IsLoad)
9274 return SDValue();
9275
9276 // Make sure the non-chain result is only used by this build vector.
9277 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9278 return SDValue();
9279
9280 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9281 (Subtarget.hasVLX() && ScalarSize == 64)) {
9282 auto *LN = cast<LoadSDNode>(Ld);
9283 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9284 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9285 SDValue BCast =
9286 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9287 LN->getMemoryVT(), LN->getMemOperand());
9288 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9289 return BCast;
9290 }
9291
9292 // The integer check is needed for the 64-bit into 128-bit case, so this
9293 // doesn't match double: there is no vbroadcastsd xmm instruction.
9294 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9295 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9296 auto *LN = cast<LoadSDNode>(Ld);
9297 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9298 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9299 SDValue BCast =
9300 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9301 LN->getMemoryVT(), LN->getMemOperand());
9302 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9303 return BCast;
9304 }
9305
9306 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9307 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9308
9309 // Unsupported broadcast.
9310 return SDValue();
9311}
9312
9313/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9314/// underlying vector and index.
9315///
9316/// Modifies \p ExtractedFromVec to the real vector and returns the real
9317/// index.
9318static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9319 SDValue ExtIdx) {
9320 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9321 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9322 return Idx;
9323
9324 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9325 // lowered this:
9326 // (extract_vector_elt (v8f32 %1), Constant<6>)
9327 // to:
9328 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9329 // (extract_subvector (v8f32 %0), Constant<4>),
9330 // undef)
9331 // Constant<0>)
9332 // In this case the vector is the extract_subvector expression and the index
9333 // is 2, as specified by the shuffle.
9334 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9335 SDValue ShuffleVec = SVOp->getOperand(0);
9336 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9337 assert(ShuffleVecVT.getVectorElementType() ==
9338 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9339
9340 int ShuffleIdx = SVOp->getMaskElt(Idx);
9341 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9342 ExtractedFromVec = ShuffleVec;
9343 return ShuffleIdx;
9344 }
9345 return Idx;
9346}
9347
9348static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9349 MVT VT = Op.getSimpleValueType();
9350
9351 // Skip if insert_vec_elt is not supported.
9352 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9353 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9354 return SDValue();
9355
9356 SDLoc DL(Op);
9357 unsigned NumElems = Op.getNumOperands();
9358
9359 SDValue VecIn1;
9360 SDValue VecIn2;
9361 SmallVector<unsigned, 4> InsertIndices;
9362 SmallVector<int, 8> Mask(NumElems, -1);
9363
9364 for (unsigned i = 0; i != NumElems; ++i) {
9365 unsigned Opc = Op.getOperand(i).getOpcode();
9366
9367 if (Opc == ISD::UNDEF)
9368 continue;
9369
9370 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9371 // Quit if more than 1 element needs inserting.
9372 if (InsertIndices.size() > 1)
9373 return SDValue();
9374
9375 InsertIndices.push_back(i);
9376 continue;
9377 }
9378
9379 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9380 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9381
9382 // Quit if non-constant index.
9383 if (!isa<ConstantSDNode>(ExtIdx))
9384 return SDValue();
9385 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9386
9387 // Quit if extracted from vector of different type.
9388 if (ExtractedFromVec.getValueType() != VT)
9389 return SDValue();
9390
9391 if (!VecIn1.getNode())
9392 VecIn1 = ExtractedFromVec;
9393 else if (VecIn1 != ExtractedFromVec) {
9394 if (!VecIn2.getNode())
9395 VecIn2 = ExtractedFromVec;
9396 else if (VecIn2 != ExtractedFromVec)
9397 // Quit if there are more than 2 vectors to shuffle.
9398 return SDValue();
9399 }
9400
9401 if (ExtractedFromVec == VecIn1)
9402 Mask[i] = Idx;
9403 else if (ExtractedFromVec == VecIn2)
9404 Mask[i] = Idx + NumElems;
9405 }
9406
9407 if (!VecIn1.getNode())
9408 return SDValue();
9409
9410 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
9411 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
9412
9413 for (unsigned Idx : InsertIndices)
9414 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
9415 DAG.getIntPtrConstant(Idx, DL));
9416
9417 return NV;
9418}
9419
9420// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
9421static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
9422 const X86Subtarget &Subtarget) {
9423
9424 MVT VT = Op.getSimpleValueType();
9425 assert((VT.getVectorElementType() == MVT::i1) &&
9426 "Unexpected type in LowerBUILD_VECTORvXi1!");
9427
9428 SDLoc dl(Op);
9429 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
9430 ISD::isBuildVectorAllOnes(Op.getNode()))
9431 return Op;
9432
9433 uint64_t Immediate = 0;
9434 SmallVector<unsigned, 16> NonConstIdx;
9435 bool IsSplat = true;
9436 bool HasConstElts = false;
9437 int SplatIdx = -1;
9438 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
9439 SDValue In = Op.getOperand(idx);
9440 if (In.isUndef())
9441 continue;
9442 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
9443 Immediate |= (InC->getZExtValue() & 0x1) << idx;
9444 HasConstElts = true;
9445 } else {
9446 NonConstIdx.push_back(idx);
9447 }
9448 if (SplatIdx < 0)
9449 SplatIdx = idx;
9450 else if (In != Op.getOperand(SplatIdx))
9451 IsSplat = false;
9452 }
9453
9454 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
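// For example, a v16i1 splat of %b essentially becomes
// (bitcast v16i1 (select %b, i16 -1, i16 0)).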
9455 if (IsSplat) {
9456 // The build_vector allows the scalar element to be larger than the vector
9457 // element type. We need to mask it to use as a condition unless we know
9458 // the upper bits are zero.
9459 // FIXME: Use computeKnownBits instead of checking specific opcode?
9460 SDValue Cond = Op.getOperand(SplatIdx);
9461 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
9462 if (Cond.getOpcode() != ISD::SETCC)
9463 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
9464 DAG.getConstant(1, dl, MVT::i8));
9465
9466 // Perform the select in the scalar domain so we can use cmov.
9467 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9468 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
9469 DAG.getAllOnesConstant(dl, MVT::i32),
9470 DAG.getConstant(0, dl, MVT::i32));
9471 Select = DAG.getBitcast(MVT::v32i1, Select);
9472 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
9473 } else {
9474 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9475 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
9476 DAG.getAllOnesConstant(dl, ImmVT),
9477 DAG.getConstant(0, dl, ImmVT));
9478 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9479 Select = DAG.getBitcast(VecVT, Select);
9480 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
9481 DAG.getIntPtrConstant(0, dl));
9482 }
9483 }
9484
9485 // insert elements one by one
9486 SDValue DstVec;
9487 if (HasConstElts) {
9488 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
9489 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
9490 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
9491 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
9492 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
9493 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
9494 } else {
9495 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
9496 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
9497 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
9498 DstVec = DAG.getBitcast(VecVT, Imm);
9499 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
9500 DAG.getIntPtrConstant(0, dl));
9501 }
9502 } else
9503 DstVec = DAG.getUNDEF(VT);
9504
9505 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
9506 unsigned InsertIdx = NonConstIdx[i];
9507 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
9508 Op.getOperand(InsertIdx),
9509 DAG.getIntPtrConstant(InsertIdx, dl));
9510 }
9511 return DstVec;
9512}
9513
9514 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
9515 switch (Opcode) {
9516 case X86ISD::PACKSS:
9517 case X86ISD::PACKUS:
9518 case X86ISD::FHADD:
9519 case X86ISD::FHSUB:
9520 case X86ISD::HADD:
9521 case X86ISD::HSUB:
9522 return true;
9523 }
9524 return false;
9525}
9526
9527/// This is a helper function of LowerToHorizontalOp().
9528 /// This function checks whether the input build_vector \p N implements a
9529/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
9530/// may not match the layout of an x86 256-bit horizontal instruction.
9531/// In other words, if this returns true, then some extraction/insertion will
9532/// be required to produce a valid horizontal instruction.
9533///
9534/// Parameter \p Opcode defines the kind of horizontal operation to match.
9535/// For example, if \p Opcode is equal to ISD::ADD, then this function
9536/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
9537/// is equal to ISD::SUB, then this function checks if this is a horizontal
9538/// arithmetic sub.
9539///
9540/// This function only analyzes elements of \p N whose indices are
9541/// in range [BaseIdx, LastIdx).
9542///
9543/// TODO: This function was originally used to match both real and fake partial
9544/// horizontal operations, but the index-matching logic is incorrect for that.
9545/// See the corrected implementation in isHopBuildVector(). Can we reduce this
9546/// code because it is only used for partial h-op matching now?
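/// For illustration, with \p Opcode == ISD::ADD and [BaseIdx, LastIdx) == [0, 4)
/// this matches a group of elements such as:
///   elt0 = (add (extract_vector_elt A, 0), (extract_vector_elt A, 1))
///   elt1 = (add (extract_vector_elt A, 2), (extract_vector_elt A, 3))
///   elt2 = (add (extract_vector_elt B, 0), (extract_vector_elt B, 1))
///   elt3 = (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// and returns A and B in \p V0 and \p V1.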
9547static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
9548 SelectionDAG &DAG,
9549 unsigned BaseIdx, unsigned LastIdx,
9550 SDValue &V0, SDValue &V1) {
9551 EVT VT = N->getValueType(0);
9552 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
9553 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
9554 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
9555 "Invalid Vector in input!");
9556
9557 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
9558 bool CanFold = true;
9559 unsigned ExpectedVExtractIdx = BaseIdx;
9560 unsigned NumElts = LastIdx - BaseIdx;
9561 V0 = DAG.getUNDEF(VT);
9562 V1 = DAG.getUNDEF(VT);
9563
9564 // Check if N implements a horizontal binop.
9565 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
9566 SDValue Op = N->getOperand(i + BaseIdx);
9567
9568 // Skip UNDEFs.
9569 if (Op->isUndef()) {
9570 // Update the expected vector extract index.
9571 if (i * 2 == NumElts)
9572 ExpectedVExtractIdx = BaseIdx;
9573 ExpectedVExtractIdx += 2;
9574 continue;
9575 }
9576
9577 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
9578
9579 if (!CanFold)
9580 break;
9581
9582 SDValue Op0 = Op.getOperand(0);
9583 SDValue Op1 = Op.getOperand(1);
9584
9585 // Try to match the following pattern:
9586 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
9587 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9588 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
9589 Op0.getOperand(0) == Op1.getOperand(0) &&
9590 isa<ConstantSDNode>(Op0.getOperand(1)) &&
9591 isa<ConstantSDNode>(Op1.getOperand(1)));
9592 if (!CanFold)
9593 break;
9594
9595 unsigned I0 = Op0.getConstantOperandVal(1);
9596 unsigned I1 = Op1.getConstantOperandVal(1);
9597
9598 if (i * 2 < NumElts) {
9599 if (V0.isUndef()) {
9600 V0 = Op0.getOperand(0);
9601 if (V0.getValueType() != VT)
9602 return false;
9603 }
9604 } else {
9605 if (V1.isUndef()) {
9606 V1 = Op0.getOperand(0);
9607 if (V1.getValueType() != VT)
9608 return false;
9609 }
9610 if (i * 2 == NumElts)
9611 ExpectedVExtractIdx = BaseIdx;
9612 }
9613
9614 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9615 if (I0 == ExpectedVExtractIdx)
9616 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9617 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9618 // Try to match the following dag sequence:
9619 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9620 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9621 } else
9622 CanFold = false;
9623
9624 ExpectedVExtractIdx += 2;
9625 }
9626
9627 return CanFold;
9628}
9629
9630/// Emit a sequence of two 128-bit horizontal add/sub followed by
9631/// a concat_vector.
9632///
9633/// This is a helper function of LowerToHorizontalOp().
9634/// This function expects two 256-bit vectors called V0 and V1.
9635/// At first, each vector is split into two separate 128-bit vectors.
9636/// Then, the resulting 128-bit vectors are used to implement two
9637/// horizontal binary operations.
9638///
9639/// The kind of horizontal binary operation is defined by \p X86Opcode.
9640///
9641 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to
9642 /// the two new horizontal binops.
9643 /// When Mode is set, the first horizontal binop dag node takes as input
9644 /// the lower 128 bits of V0 and the upper 128 bits of V0. The second
9645 /// horizontal binop dag node takes as input the lower 128 bits of V1
9646 /// and the upper 128 bits of V1.
9647/// Example:
9648/// HADD V0_LO, V0_HI
9649/// HADD V1_LO, V1_HI
9650///
9651 /// Otherwise, the first horizontal binop dag node takes as input the lower
9652 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
9653 /// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9654/// Example:
9655/// HADD V0_LO, V1_LO
9656/// HADD V0_HI, V1_HI
9657///
9658/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9659/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9660/// the upper 128-bits of the result.
9661static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9662 const SDLoc &DL, SelectionDAG &DAG,
9663 unsigned X86Opcode, bool Mode,
9664 bool isUndefLO, bool isUndefHI) {
9665 MVT VT = V0.getSimpleValueType();
9666 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9667 "Invalid nodes in input!");
9668
9669 unsigned NumElts = VT.getVectorNumElements();
9670 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9671 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9672 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9673 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9674 MVT NewVT = V0_LO.getSimpleValueType();
9675
9676 SDValue LO = DAG.getUNDEF(NewVT);
9677 SDValue HI = DAG.getUNDEF(NewVT);
9678
9679 if (Mode) {
9680 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9681 if (!isUndefLO && !V0->isUndef())
9682 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9683 if (!isUndefHI && !V1->isUndef())
9684 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9685 } else {
9686 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9687 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9688 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9689
9690 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9691 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9692 }
9693
9694 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9695}
9696
9697 /// Returns true iff \p BV builds a vector with the result equivalent to
9698 /// the result of an ADDSUB/SUBADD operation.
9699 /// If true is returned, then the operands of the ADDSUB = Opnd0 +- Opnd1
9700 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9701 /// \p Opnd0 and \p Opnd1.
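/// For example, a v4f32 build_vector of
///   (fsub (extract A, 0), (extract B, 0)), (fadd (extract A, 1), (extract B, 1)),
///   (fsub (extract A, 2), (extract B, 2)), (fadd (extract A, 3), (extract B, 3))
/// matches with \p IsSubAdd == false, \p Opnd0 == A and \p Opnd1 == B.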
9702static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9703 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9704 SDValue &Opnd0, SDValue &Opnd1,
9705 unsigned &NumExtracts,
9706 bool &IsSubAdd) {
9707
9708 MVT VT = BV->getSimpleValueType(0);
9709 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9710 return false;
9711
9712 unsigned NumElts = VT.getVectorNumElements();
9713 SDValue InVec0 = DAG.getUNDEF(VT);
9714 SDValue InVec1 = DAG.getUNDEF(VT);
9715
9716 NumExtracts = 0;
9717
9718 // Odd-numbered elements in the input build vector are obtained from
9719 // adding/subtracting two integer/float elements.
9720 // Even-numbered elements in the input build vector are obtained from
9721 // subtracting/adding two integer/float elements.
9722 unsigned Opc[2] = {0, 0};
9723 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9724 SDValue Op = BV->getOperand(i);
9725
9726 // Skip 'undef' values.
9727 unsigned Opcode = Op.getOpcode();
9728 if (Opcode == ISD::UNDEF)
9729 continue;
9730
9731 // Early exit if we found an unexpected opcode.
9732 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9733 return false;
9734
9735 SDValue Op0 = Op.getOperand(0);
9736 SDValue Op1 = Op.getOperand(1);
9737
9738 // Try to match the following pattern:
9739 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9740 // Early exit if we cannot match that sequence.
9741 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9742 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9743 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9744 Op0.getOperand(1) != Op1.getOperand(1))
9745 return false;
9746
9747 unsigned I0 = Op0.getConstantOperandVal(1);
9748 if (I0 != i)
9749 return false;
9750
9751 // We found a valid add/sub node; make sure it's the same opcode as previous
9752 // elements for this parity.
9753 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9754 return false;
9755 Opc[i % 2] = Opcode;
9756
9757 // Update InVec0 and InVec1.
9758 if (InVec0.isUndef()) {
9759 InVec0 = Op0.getOperand(0);
9760 if (InVec0.getSimpleValueType() != VT)
9761 return false;
9762 }
9763 if (InVec1.isUndef()) {
9764 InVec1 = Op1.getOperand(0);
9765 if (InVec1.getSimpleValueType() != VT)
9766 return false;
9767 }
9768
9769 // Make sure that the operands of each add/sub node always
9770 // come from the same pair of vectors.
9771 if (InVec0 != Op0.getOperand(0)) {
9772 if (Opcode == ISD::FSUB)
9773 return false;
9774
9775 // FADD is commutable. Try to commute the operands
9776 // and then test again.
9777 std::swap(Op0, Op1);
9778 if (InVec0 != Op0.getOperand(0))
9779 return false;
9780 }
9781
9782 if (InVec1 != Op1.getOperand(0))
9783 return false;
9784
9785 // Increment the number of extractions done.
9786 ++NumExtracts;
9787 }
9788
9789 // Ensure we have found an opcode for both parities and that they are
9790 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9791 // inputs are undef.
9792 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9793 InVec0.isUndef() || InVec1.isUndef())
9794 return false;
9795
9796 IsSubAdd = Opc[0] == ISD::FADD;
9797
9798 Opnd0 = InVec0;
9799 Opnd1 = InVec1;
9800 return true;
9801}
9802
9803 /// Returns true if it is possible to fold MUL and an idiom that has already
9804 /// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9805 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9806 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, and \p Opnd2.
9807///
9808/// Prior to calling this function it should be known that there is some
9809/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9810/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9811/// before replacement of such SDNode with ADDSUB operation. Thus the number
9812/// of \p Opnd0 uses is expected to be equal to 2.
9813/// For example, this function may be called for the following IR:
9814/// %AB = fmul fast <2 x double> %A, %B
9815/// %Sub = fsub fast <2 x double> %AB, %C
9816/// %Add = fadd fast <2 x double> %AB, %C
9817/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9818/// <2 x i32> <i32 0, i32 3>
9819/// There is a def for %Addsub here, which potentially can be replaced by
9820/// X86ISD::ADDSUB operation:
9821/// %Addsub = X86ISD::ADDSUB %AB, %C
9822/// and such ADDSUB can further be replaced with FMADDSUB:
9823/// %Addsub = FMADDSUB %A, %B, %C.
9824///
9825/// The main reason why this method is called before the replacement of the
9826/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9827/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9828/// FMADDSUB is.
9829static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9830 SelectionDAG &DAG,
9831 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9832 unsigned ExpectedUses) {
9833 if (Opnd0.getOpcode() != ISD::FMUL ||
9834 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9835 return false;
9836
9837 // FIXME: These checks must match the similar ones in
9838 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9839 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9840 // or MUL + ADDSUB to FMADDSUB.
9841 const TargetOptions &Options = DAG.getTarget().Options;
9842 bool AllowFusion =
9843 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9844 if (!AllowFusion)
9845 return false;
9846
9847 Opnd2 = Opnd1;
9848 Opnd1 = Opnd0.getOperand(1);
9849 Opnd0 = Opnd0.getOperand(0);
9850
9851 return true;
9852}
9853
9854 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
9855 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB, or
9856 /// X86ISD::FMSUBADD node.
9857static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9858 const X86Subtarget &Subtarget,
9859 SelectionDAG &DAG) {
9860 SDValue Opnd0, Opnd1;
9861 unsigned NumExtracts;
9862 bool IsSubAdd;
9863 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9864 IsSubAdd))
9865 return SDValue();
9866
9867 MVT VT = BV->getSimpleValueType(0);
9868 SDLoc DL(BV);
9869
9870 // Try to generate X86ISD::FMADDSUB node here.
9871 SDValue Opnd2;
9872 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9873 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9874 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9875 }
9876
9877 // We only support ADDSUB.
9878 if (IsSubAdd)
9879 return SDValue();
9880
9881 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9882 // the ADDSUB idiom has been successfully recognized. There are no known
9883 // X86 targets with 512-bit ADDSUB instructions!
9884 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9885 // recognition.
9886 if (VT.is512BitVector())
9887 return SDValue();
9888
9889 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9890}
9891
9892static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9893 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9894 // Initialize outputs to known values.
9895 MVT VT = BV->getSimpleValueType(0);
9896 HOpcode = ISD::DELETED_NODE;
9897 V0 = DAG.getUNDEF(VT);
9898 V1 = DAG.getUNDEF(VT);
9899
9900 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9901 // half of the result is calculated independently from the 128-bit halves of
9902 // the inputs, so that makes the index-checking logic below more complicated.
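// For example, a v8i32 HADD produces result elements 0-1 from the low 128 bits
// of V0, elements 2-3 from the low 128 bits of V1, elements 4-5 from the high
// 128 bits of V0, and elements 6-7 from the high 128 bits of V1.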
9903 unsigned NumElts = VT.getVectorNumElements();
9904 unsigned GenericOpcode = ISD::DELETED_NODE;
9905 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9906 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9907 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9908 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9909 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9910 // Ignore undef elements.
9911 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9912 if (Op.isUndef())
9913 continue;
9914
9915 // If there's an opcode mismatch, we're done.
9916 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9917 return false;
9918
9919 // Initialize horizontal opcode.
9920 if (HOpcode == ISD::DELETED_NODE) {
9921 GenericOpcode = Op.getOpcode();
9922 switch (GenericOpcode) {
9923 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9924 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9925 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9926 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9927 default: return false;
9928 }
9929 }
9930
9931 SDValue Op0 = Op.getOperand(0);
9932 SDValue Op1 = Op.getOperand(1);
9933 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9934 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9935 Op0.getOperand(0) != Op1.getOperand(0) ||
9936 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9937 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9938 return false;
9939
9940 // The source vector is chosen based on which 64-bit half of the
9941 // destination vector is being calculated.
9942 if (j < NumEltsIn64Bits) {
9943 if (V0.isUndef())
9944 V0 = Op0.getOperand(0);
9945 } else {
9946 if (V1.isUndef())
9947 V1 = Op0.getOperand(0);
9948 }
9949
9950 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9951 if (SourceVec != Op0.getOperand(0))
9952 return false;
9953
9954 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9955 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9956 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9957 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9958 (j % NumEltsIn64Bits) * 2;
9959 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9960 continue;
9961
9962 // If this is not a commutative op, this does not match.
9963 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9964 return false;
9965
9966 // Addition is commutative, so try swapping the extract indexes.
9967 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9968 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9969 continue;
9970
9971 // Extract indexes do not match horizontal requirement.
9972 return false;
9973 }
9974 }
9975 // We matched. Opcode and operands are returned by reference as arguments.
9976 return true;
9977}
9978
9979static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9980 SelectionDAG &DAG, unsigned HOpcode,
9981 SDValue V0, SDValue V1) {
9982 // If either input vector is not the same size as the build vector,
9983 // extract/insert the low bits to the correct size.
9984 // This is free (examples: zmm --> xmm, xmm --> ymm).
9985 MVT VT = BV->getSimpleValueType(0);
9986 unsigned Width = VT.getSizeInBits();
9987 if (V0.getValueSizeInBits() > Width)
9988 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9989 else if (V0.getValueSizeInBits() < Width)
9990 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9991
9992 if (V1.getValueSizeInBits() > Width)
9993 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9994 else if (V1.getValueSizeInBits() < Width)
9995 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9996
9997 unsigned NumElts = VT.getVectorNumElements();
9998 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9999 for (unsigned i = 0; i != NumElts; ++i)
10000 if (BV->getOperand(i).isUndef())
10001 DemandedElts.clearBit(i);
10002
10003 // If we don't need the upper xmm, then perform as a xmm hop.
10004 unsigned HalfNumElts = NumElts / 2;
10005 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10006 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10007 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10008 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10009 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10010 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10011 }
10012
10013 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10014}
10015
10016/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10017static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10018 const X86Subtarget &Subtarget,
10019 SelectionDAG &DAG) {
10020 // We need at least 2 non-undef elements to make this worthwhile by default.
10021 unsigned NumNonUndefs =
10022 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10023 if (NumNonUndefs < 2)
10024 return SDValue();
10025
10026 // There are 4 sets of horizontal math operations distinguished by type:
10027 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10028 // subtarget feature. Try to match those "native" patterns first.
10029 MVT VT = BV->getSimpleValueType(0);
10030 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10031 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10032 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10033 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10034 unsigned HOpcode;
10035 SDValue V0, V1;
10036 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10037 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10038 }
10039
10040 // Try harder to match 256-bit ops by using extract/concat.
10041 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10042 return SDValue();
10043
10044 // Count the number of UNDEF operands in the input build_vector.
10045 unsigned NumElts = VT.getVectorNumElements();
10046 unsigned Half = NumElts / 2;
10047 unsigned NumUndefsLO = 0;
10048 unsigned NumUndefsHI = 0;
10049 for (unsigned i = 0, e = Half; i != e; ++i)
10050 if (BV->getOperand(i)->isUndef())
10051 NumUndefsLO++;
10052
10053 for (unsigned i = Half, e = NumElts; i != e; ++i)
10054 if (BV->getOperand(i)->isUndef())
10055 NumUndefsHI++;
10056
10057 SDLoc DL(BV);
10058 SDValue InVec0, InVec1;
10059 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10060 SDValue InVec2, InVec3;
10061 unsigned X86Opcode;
10062 bool CanFold = true;
10063
10064 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10065 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10066 InVec3) &&
10067 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10068 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10069 X86Opcode = X86ISD::HADD;
10070 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10071 InVec1) &&
10072 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10073 InVec3) &&
10074 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10075 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10076 X86Opcode = X86ISD::HSUB;
10077 else
10078 CanFold = false;
10079
10080 if (CanFold) {
10081 // Do not try to expand this build_vector into a pair of horizontal
10082 // add/sub if we can emit a pair of scalar add/sub.
10083 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10084 return SDValue();
10085
10086 // Convert this build_vector into a pair of horizontal binops followed by
10087 // a concat vector. We must adjust the outputs from the partial horizontal
10088 // matching calls above to account for undefined vector halves.
10089 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10090 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10091 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10092 bool isUndefLO = NumUndefsLO == Half;
10093 bool isUndefHI = NumUndefsHI == Half;
10094 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10095 isUndefHI);
10096 }
10097 }
10098
10099 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10100 VT == MVT::v16i16) {
10101 unsigned X86Opcode;
10102 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10103 X86Opcode = X86ISD::HADD;
10104 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10105 InVec1))
10106 X86Opcode = X86ISD::HSUB;
10107 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10108 InVec1))
10109 X86Opcode = X86ISD::FHADD;
10110 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10111 InVec1))
10112 X86Opcode = X86ISD::FHSUB;
10113 else
10114 return SDValue();
10115
10116 // Don't try to expand this build_vector into a pair of horizontal add/sub
10117 // if we can simply emit a pair of scalar add/sub.
10118 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10119 return SDValue();
10120
10121 // Convert this build_vector into two horizontal add/sub followed by
10122 // a concat vector.
10123 bool isUndefLO = NumUndefsLO == Half;
10124 bool isUndefHI = NumUndefsHI == Half;
10125 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10126 isUndefLO, isUndefHI);
10127 }
10128
10129 return SDValue();
10130}
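// [Editorial sketch, not part of X86ISelLowering.cpp] A scalar reference model
// of the pattern LowerToHorizontalOp matches above: per 128-bit lane, an
// X86ISD::HADD of v4i32 produces the pairwise sums of its first and then its
// second operand, which is exactly the build_vector shape being recognized.
// All names here are illustrative only.
#include <array>

static std::array<int, 4> haddLaneModel(const std::array<int, 4> &A,
                                        const std::array<int, 4> &B) {
  // Reference semantics of one PHADDD: {A0+A1, A2+A3, B0+B1, B2+B3}.
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}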
10131
10132static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10133 SelectionDAG &DAG);
10134
10135/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10136/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10137/// just apply the bit operation to the vectors.
10138/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10139/// from this, but enough scalar bit operations are created from the later
10140/// legalization + scalarization stages to need basic support.
10141static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10142 const X86Subtarget &Subtarget,
10143 SelectionDAG &DAG) {
10144 SDLoc DL(Op);
10145 MVT VT = Op->getSimpleValueType(0);
10146 unsigned NumElems = VT.getVectorNumElements();
10147 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10148
10149 // Check that all elements have the same opcode.
10150 // TODO: Should we allow UNDEFS and if so how many?
10151 unsigned Opcode = Op->getOperand(0).getOpcode();
10152 for (unsigned i = 1; i < NumElems; ++i)
10153 if (Opcode != Op->getOperand(i).getOpcode())
10154 return SDValue();
10155
10156 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10157 bool IsShift = false;
10158 switch (Opcode) {
10159 default:
10160 return SDValue();
10161 case ISD::SHL:
10162 case ISD::SRL:
10163 case ISD::SRA:
10164 IsShift = true;
10165 break;
10166 case ISD::AND:
10167 case ISD::XOR:
10168 case ISD::OR:
10169 // Don't do this if the buildvector is a splat - we'd replace one
10170 // constant with an entire vector.
10171 if (Op->getSplatValue())
10172 return SDValue();
10173 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10174 return SDValue();
10175 break;
10176 }
10177
10178 SmallVector<SDValue, 4> LHSElts, RHSElts;
10179 for (SDValue Elt : Op->ops()) {
10180 SDValue LHS = Elt.getOperand(0);
10181 SDValue RHS = Elt.getOperand(1);
10182
10183 // We expect the canonicalized RHS operand to be the constant.
10184 if (!isa<ConstantSDNode>(RHS))
10185 return SDValue();
10186
10187 // Extend shift amounts.
10188 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10189 if (!IsShift)
10190 return SDValue();
10191 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10192 }
10193
10194 LHSElts.push_back(LHS);
10195 RHSElts.push_back(RHS);
10196 }
10197
10198 // Limit to shifts by uniform immediates.
10199 // TODO: Only accept vXi8/vXi64 special cases?
10200 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10201 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10202 return SDValue();
10203
10204 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10205 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10206 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10207
10208 if (!IsShift)
10209 return Res;
10210
10211 // Immediately lower the shift to ensure the constant build vector doesn't
10212 // get converted to a constant pool before the shift is lowered.
10213 return LowerShift(Res, Subtarget, DAG);
10214}
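// [Editorial sketch, not part of X86ISelLowering.cpp] The rewrite performed by
// lowerBuildVectorToBitOp, modeled on plain arrays: a build_vector whose lanes
// are (x[i] op C[i]) is re-expressed as a single lane-wise op on two gathered
// vectors, so one vector AND/OR/XOR/shift replaces N scalar ops plus inserts.
// The helper below is illustrative, using AND as the element opcode.
#include <cstddef>
#include <cstdint>

static void gatherThenSingleAnd(const uint32_t *X, const uint32_t *C,
                                uint32_t *Out, std::size_t NumElems) {
  // Conceptually: LHS = build_vector(X...), RHS = build_vector(C...),
  // Out = LHS & RHS.
  for (std::size_t i = 0; i != NumElems; ++i)
    Out[i] = X[i] & C[i];
}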
10215
10216/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10217/// functionality to do this, so it's all zeros, all ones, or some derivation
10218/// that is cheap to calculate.
10219static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10220 const X86Subtarget &Subtarget) {
10221 SDLoc DL(Op);
10222 MVT VT = Op.getSimpleValueType();
10223
10224 // Vectors containing all zeros can be matched by pxor and xorps.
10225 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10226 return Op;
10227
10228 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10229 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10230 // vpcmpeqd on 256-bit vectors.
10231 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10232 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10233 return Op;
10234
10235 return getOnesVector(VT, DAG, DL);
10236 }
10237
10238 return SDValue();
10239}
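// [Editorial sketch, not part of X86ISelLowering.cpp] The "all ones without a
// load" idiom mentioned above, written with SSE2 intrinsics (assumes a target
// with SSE2, which is implied by x86-64): comparing a register with itself for
// equality sets every bit of the result.
#include <immintrin.h>

static bool allOnesWithoutALoad() {
  __m128i Z = _mm_setzero_si128();         // pxor/xorps: all-zeros vector
  __m128i AllOnes = _mm_cmpeq_epi32(Z, Z); // pcmpeqd x, x: every bit set
  return _mm_movemask_epi8(AllOnes) == 0xFFFF;
}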
10240
10241/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10242/// from a vector of source values and a vector of extraction indices.
10243/// The vectors might be manipulated to match the type of the permute op.
10244static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10245 SDLoc &DL, SelectionDAG &DAG,
10246 const X86Subtarget &Subtarget) {
10247 MVT ShuffleVT = VT;
10248 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10249 unsigned NumElts = VT.getVectorNumElements();
10250 unsigned SizeInBits = VT.getSizeInBits();
10251
10252 // Adjust IndicesVec to match VT size.
10253 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10254 "Illegal variable permute mask size");
10255 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10256 // Narrow/widen the indices vector to the correct size.
10257 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10258 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10259 NumElts * VT.getScalarSizeInBits());
10260 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10261 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10262 SDLoc(IndicesVec), SizeInBits);
10263 // Zero-extend the index elements within the vector.
10264 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10265 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10266 IndicesVT, IndicesVec);
10267 }
10268 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10269
10270 // Handle a SrcVec that doesn't match the VT type.
10271 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10272 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10273 // Handle larger SrcVec by treating it as a larger permute.
10274 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10275 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10276 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10277 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10278 Subtarget, DAG, SDLoc(IndicesVec));
10279 SDValue NewSrcVec =
10280 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10281 if (NewSrcVec)
10282 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10283 return SDValue();
10284 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10285 // Widen smaller SrcVec to match VT.
10286 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10287 } else
10288 return SDValue();
10289 }
10290
10291 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10292 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10293 EVT SrcVT = Idx.getValueType();
10294 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10295 uint64_t IndexScale = 0;
10296 uint64_t IndexOffset = 0;
10297
10298 // If we're scaling a smaller permute op, then we need to repeat the
10299 // indices, scaling and offsetting them as well.
10300 // e.g. v4i32 -> v16i8 (Scale = 4)
10301 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10302 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10303 for (uint64_t i = 0; i != Scale; ++i) {
10304 IndexScale |= Scale << (i * NumDstBits);
10305 IndexOffset |= i << (i * NumDstBits);
10306 }
10307
10308 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10309 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10310 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10311 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10312 return Idx;
10313 };
10314
10315 unsigned Opcode = 0;
10316 switch (VT.SimpleTy) {
10317 default:
10318 break;
10319 case MVT::v16i8:
10320 if (Subtarget.hasSSSE3())
10321 Opcode = X86ISD::PSHUFB;
10322 break;
10323 case MVT::v8i16:
10324 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10325 Opcode = X86ISD::VPERMV;
10326 else if (Subtarget.hasSSSE3()) {
10327 Opcode = X86ISD::PSHUFB;
10328 ShuffleVT = MVT::v16i8;
10329 }
10330 break;
10331 case MVT::v4f32:
10332 case MVT::v4i32:
10333 if (Subtarget.hasAVX()) {
10334 Opcode = X86ISD::VPERMILPV;
10335 ShuffleVT = MVT::v4f32;
10336 } else if (Subtarget.hasSSSE3()) {
10337 Opcode = X86ISD::PSHUFB;
10338 ShuffleVT = MVT::v16i8;
10339 }
10340 break;
10341 case MVT::v2f64:
10342 case MVT::v2i64:
10343 if (Subtarget.hasAVX()) {
10344 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10345 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10346 Opcode = X86ISD::VPERMILPV;
10347 ShuffleVT = MVT::v2f64;
10348 } else if (Subtarget.hasSSE41()) {
10349 // SSE41 can compare v2i64 - select between indices 0 and 1.
10350 return DAG.getSelectCC(
10351 DL, IndicesVec,
10352 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10353 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10354 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10355 ISD::CondCode::SETEQ);
10356 }
10357 break;
10358 case MVT::v32i8:
10359 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10360 Opcode = X86ISD::VPERMV;
10361 else if (Subtarget.hasXOP()) {
10362 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10363 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10364 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10365 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10366 return DAG.getNode(
10367 ISD::CONCAT_VECTORS, DL, VT,
10368 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10369 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10370 } else if (Subtarget.hasAVX()) {
10371 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10372 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10373 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10374 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10375 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10376 ArrayRef<SDValue> Ops) {
10377 // Permute Lo and Hi and then select based on index range.
10378 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
10379 // care about bit[7] as it's just an index vector.
10380 SDValue Idx = Ops[2];
10381 EVT VT = Idx.getValueType();
10382 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
10383 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
10384 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
10385 ISD::CondCode::SETGT);
10386 };
10387 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
10388 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
10389 PSHUFBBuilder);
10390 }
10391 break;
10392 case MVT::v16i16:
10393 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10394 Opcode = X86ISD::VPERMV;
10395 else if (Subtarget.hasAVX()) {
10396 // Scale to v32i8 and perform as v32i8.
10397 IndicesVec = ScaleIndices(IndicesVec, 2);
10398 return DAG.getBitcast(
10399 VT, createVariablePermute(
10400 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
10401 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
10402 }
10403 break;
10404 case MVT::v8f32:
10405 case MVT::v8i32:
10406 if (Subtarget.hasAVX2())
10407 Opcode = X86ISD::VPERMV;
10408 else if (Subtarget.hasAVX()) {
10409 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
10410 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10411 {0, 1, 2, 3, 0, 1, 2, 3});
10412 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
10413 {4, 5, 6, 7, 4, 5, 6, 7});
10414 if (Subtarget.hasXOP())
10415 return DAG.getBitcast(
10416 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
10417 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10418 // Permute Lo and Hi and then select based on index range.
10419 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
10420 SDValue Res = DAG.getSelectCC(
10421 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
10422 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
10423 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
10424 ISD::CondCode::SETGT);
10425 return DAG.getBitcast(VT, Res);
10426 }
10427 break;
10428 case MVT::v4i64:
10429 case MVT::v4f64:
10430 if (Subtarget.hasAVX512()) {
10431 if (!Subtarget.hasVLX()) {
10432 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
10433 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
10434 SDLoc(SrcVec));
10435 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
10436 DAG, SDLoc(IndicesVec));
10437 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
10438 DAG, Subtarget);
10439 return extract256BitVector(Res, 0, DAG, DL);
10440 }
10441 Opcode = X86ISD::VPERMV;
10442 } else if (Subtarget.hasAVX()) {
10443 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
10444 SDValue LoLo =
10445 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
10446 SDValue HiHi =
10447 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
10448 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
10449 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10450 if (Subtarget.hasXOP())
10451 return DAG.getBitcast(
10452 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
10453 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
10454 // Permute Lo and Hi and then select based on index range.
10455 // This works as VPERMILPD only uses index bit[1] to permute elements.
10456 SDValue Res = DAG.getSelectCC(
10457 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
10458 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
10459 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
10460 ISD::CondCode::SETGT);
10461 return DAG.getBitcast(VT, Res);
10462 }
10463 break;
10464 case MVT::v64i8:
10465 if (Subtarget.hasVBMI())
10466 Opcode = X86ISD::VPERMV;
10467 break;
10468 case MVT::v32i16:
10469 if (Subtarget.hasBWI())
10470 Opcode = X86ISD::VPERMV;
10471 break;
10472 case MVT::v16f32:
10473 case MVT::v16i32:
10474 case MVT::v8f64:
10475 case MVT::v8i64:
10476 if (Subtarget.hasAVX512())
10477 Opcode = X86ISD::VPERMV;
10478 break;
10479 }
10480 if (!Opcode)
10481 return SDValue();
10482
10483 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
10484 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
10485 "Illegal variable permute shuffle type");
10486
10487 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
10488 if (Scale > 1)
10489 IndicesVec = ScaleIndices(IndicesVec, Scale);
10490
10491 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
10492 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
10493
10494 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
10495 SDValue Res = Opcode == X86ISD::VPERMV
10496 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
10497 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
10498 return DAG.getBitcast(VT, Res);
10499}
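// [Editorial sketch, not part of X86ISelLowering.cpp] Worked example of the
// ScaleIndices step inside createVariablePermute when a v4i32 permute is
// carried out as a v16i8 PSHUFB (Scale = 4): one 32-bit index N becomes the
// four byte indices {4N, 4N+1, 4N+2, 4N+3} via a multiply and an add, matching
// the IndexScale/IndexOffset splats described in the lambda's comment.
#include <cstdint>

static uint32_t scaleDwordIndexToByteIndices(uint32_t N) {
  const uint32_t IndexScale = 0x04040404;  // splat(4) in each byte
  const uint32_t IndexOffset = 0x03020100; // bytes {0, 1, 2, 3}
  // e.g. N = 2 -> 0x0B0A0908, i.e. byte indices 8, 9, 10, 11 (dword 2's bytes).
  return N * IndexScale + IndexOffset;
}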
10500
10501// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
10502// reasoned to be a permutation of a vector by indices in a non-constant vector.
10503// (build_vector (extract_elt V, (extract_elt I, 0)),
10504// (extract_elt V, (extract_elt I, 1)),
10505// ...
10506// ->
10507// (vpermv I, V)
10508//
10509// TODO: Handle undefs
10510// TODO: Utilize pshufb and zero mask blending to support more efficient
10511// construction of vectors with constant-0 elements.
10512static SDValue
10513LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
10514 const X86Subtarget &Subtarget) {
10515 SDValue SrcVec, IndicesVec;
10516 // Check for a match of the permute source vector and permute index elements.
10517 // This is done by checking that the i-th build_vector operand is of the form:
10518 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
10519 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
10520 SDValue Op = V.getOperand(Idx);
10521 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10522 return SDValue();
10523
10524 // If this is the first extract encountered in V, set the source vector,
10525 // otherwise verify the extract is from the previously defined source
10526 // vector.
10527 if (!SrcVec)
10528 SrcVec = Op.getOperand(0);
10529 else if (SrcVec != Op.getOperand(0))
10530 return SDValue();
10531 SDValue ExtractedIndex = Op->getOperand(1);
10532 // Peek through extends.
10533 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
10534 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
10535 ExtractedIndex = ExtractedIndex.getOperand(0);
10536 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10537 return SDValue();
10538
10539 // If this is the first extract from the index vector candidate, set the
10540 // indices vector, otherwise verify the extract is from the previously
10541 // defined indices vector.
10542 if (!IndicesVec)
10543 IndicesVec = ExtractedIndex.getOperand(0);
10544 else if (IndicesVec != ExtractedIndex.getOperand(0))
10545 return SDValue();
10546
10547 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
10548 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
10549 return SDValue();
10550 }
10551
10552 SDLoc DL(V);
10553 MVT VT = V.getSimpleValueType();
10554 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10555}
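// [Editorial sketch, not part of X86ISelLowering.cpp] The source-level shape
// that LowerBUILD_VECTORAsVariablePermute recognizes, written as a plain loop:
// every output element is an element of one source vector selected by the
// matching element of one index vector, a gather-within-register that maps
// onto VPERMV/VPERMILPV/PSHUFB.
static void variablePermuteModel(const int *V, const int *I, int *Out,
                                 unsigned NumElts) {
  for (unsigned i = 0; i != NumElts; ++i)
    Out[i] = V[I[i]]; // build_vector(extract(V, extract(I, i))) for each i
}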
10556
10557SDValue
10558X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
10559 SDLoc dl(Op);
10560
10561 MVT VT = Op.getSimpleValueType();
10562 MVT EltVT = VT.getVectorElementType();
10563 unsigned NumElems = Op.getNumOperands();
10564
10565 // Generate vectors for predicate vectors.
10566 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
10567 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
10568
10569 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
10570 return VectorConstant;
10571
10572 unsigned EVTBits = EltVT.getSizeInBits();
10573 APInt UndefMask = APInt::getNullValue(NumElems);
10574 APInt ZeroMask = APInt::getNullValue(NumElems);
10575 APInt NonZeroMask = APInt::getNullValue(NumElems);
10576 bool IsAllConstants = true;
10577 SmallSet<SDValue, 8> Values;
10578 unsigned NumConstants = NumElems;
10579 for (unsigned i = 0; i < NumElems; ++i) {
10580 SDValue Elt = Op.getOperand(i);
10581 if (Elt.isUndef()) {
10582 UndefMask.setBit(i);
10583 continue;
10584 }
10585 Values.insert(Elt);
10586 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
10587 IsAllConstants = false;
10588 NumConstants--;
10589 }
10590 if (X86::isZeroNode(Elt)) {
10591 ZeroMask.setBit(i);
10592 } else {
10593 NonZeroMask.setBit(i);
10594 }
10595 }
10596
10597 // All undef vector. Return an UNDEF. All zero vectors were handled above.
10598 if (NonZeroMask == 0) {
10599 assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
10600 return DAG.getUNDEF(VT);
10601 }
10602
10603 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
10604
10605 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
10606 // lowering to a smaller build vector and padding with undef/zero.
10607 if ((VT.is256BitVector() || VT.is512BitVector()) &&
10608 !isFoldableUseOfShuffle(BV)) {
10609 unsigned UpperElems = NumElems / 2;
10610 APInt UndefOrZeroMask = UndefMask | ZeroMask;
10611 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
10612 if (NumUpperUndefsOrZeros >= UpperElems) {
10613 if (VT.is512BitVector() &&
10614 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
10615 UpperElems = NumElems - (NumElems / 4);
10616 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
10617 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
10618 SDValue NewBV =
10619 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
10620 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
10621 }
10622 }
10623
10624 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
10625 return AddSub;
10626 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
10627 return HorizontalOp;
10628 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
10629 return Broadcast;
10630 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
10631 return BitOp;
10632
10633 unsigned NumZero = ZeroMask.countPopulation();
10634 unsigned NumNonZero = NonZeroMask.countPopulation();
10635
10636 // If we are inserting one variable into a vector of non-zero constants, try
10637 // to avoid loading each constant element as a scalar. Load the constants as a
10638 // vector and then insert the variable scalar element. If insertion is not
10639 // supported, fall back to a shuffle to get the scalar blended with the
10640 // constants. Insertion into a zero vector is handled as a special-case
10641 // somewhere below here.
10642 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10643 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10644 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10645 // Create an all-constant vector. The variable element in the old
10646 // build vector is replaced by undef in the constant vector. Save the
10647 // variable scalar element and its index for use in the insertelement.
10648 LLVMContext &Context = *DAG.getContext();
10649 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10650 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10651 SDValue VarElt;
10652 SDValue InsIndex;
10653 for (unsigned i = 0; i != NumElems; ++i) {
10654 SDValue Elt = Op.getOperand(i);
10655 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10656 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10657 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10658 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10659 else if (!Elt.isUndef()) {
10660 assert(!VarElt.getNode() && !InsIndex.getNode() &&
10661 "Expected one variable element in this vector");
10662 VarElt = Elt;
10663 InsIndex = DAG.getVectorIdxConstant(i, dl);
10664 }
10665 }
10666 Constant *CV = ConstantVector::get(ConstVecOps);
10667 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10668
10669 // The constants we just created may not be legal (e.g., floating point). We
10670 // must lower the vector right here because we cannot guarantee that we'll
10671 // legalize it before loading it. This is also why we could not just create
10672 // a new build vector here. If the build vector contains illegal constants,
10673 // it could get split back up into a series of insert elements.
10674 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10675 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10676 MachineFunction &MF = DAG.getMachineFunction();
10677 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10678 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10679 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10680 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10681 if (InsertC < NumEltsInLow128Bits)
10682 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10683
10684 // There's no good way to insert into the high elements of a >128-bit
10685 // vector, so use shuffles to avoid an extract/insert sequence.
10686 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10687 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10688 SmallVector<int, 8> ShuffleMask;
10689 unsigned NumElts = VT.getVectorNumElements();
10690 for (unsigned i = 0; i != NumElts; ++i)
10691 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10692 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10693 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10694 }
10695
10696 // Special case for single non-zero, non-undef, element.
10697 if (NumNonZero == 1) {
10698 unsigned Idx = NonZeroMask.countTrailingZeros();
10699 SDValue Item = Op.getOperand(Idx);
10700
10701 // If we have a constant or non-constant insertion into the low element of
10702 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10703 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10704 // depending on what the source datatype is.
10705 if (Idx == 0) {
10706 if (NumZero == 0)
10707 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10708
10709 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
10710 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
10711 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
10712 assert((VT.is128BitVector() || VT.is256BitVector() ||
10713 VT.is512BitVector()) &&
10714 "Expected an SSE value type!");
10715 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10716 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
10717 // zero vector.
10718 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10719 }
10720
10721 // We can't directly insert an i8 or i16 into a vector, so zero extend
10722 // it to i32 first.
10723 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10724 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10725 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
10726 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10727 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10728 return DAG.getBitcast(VT, Item);
10729 }
10730 }
10731
10732 // Is it a vector logical left shift?
10733 if (NumElems == 2 && Idx == 1 &&
10734 X86::isZeroNode(Op.getOperand(0)) &&
10735 !X86::isZeroNode(Op.getOperand(1))) {
10736 unsigned NumBits = VT.getSizeInBits();
10737 return getVShift(true, VT,
10738 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10739 VT, Op.getOperand(1)),
10740 NumBits/2, DAG, *this, dl);
10741 }
10742
10743 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10744 return SDValue();
10745
10746 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10747 // is a non-constant being inserted into an element other than the low one,
10748 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10749 // movd/movss) to move this into the low element, then shuffle it into
10750 // place.
10751 if (EVTBits == 32) {
10752 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10753 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10754 }
10755 }
10756
10757 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10758 if (Values.size() == 1) {
10759 if (EVTBits == 32) {
10760 // Instead of a shuffle like this:
10761 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10762 // Check if it's possible to issue this instead.
10763 // shuffle (vload ptr), undef, <1, 1, 1, 1>
10764 unsigned Idx = NonZeroMask.countTrailingZeros();
10765 SDValue Item = Op.getOperand(Idx);
10766 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10767 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10768 }
10769 return SDValue();
10770 }
10771
10772 // A vector full of immediates; various special cases are already
10773 // handled, so this is best done with a single constant-pool load.
10774 if (IsAllConstants)
10775 return SDValue();
10776
10777 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10778 return V;
10779
10780 // See if we can use a vector load to get all of the elements.
10781 {
10782 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10783 if (SDValue LD =
10784 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10785 return LD;
10786 }
10787
10788 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10789 // build_vector and broadcast it.
10790 // TODO: We could probably generalize this more.
10791 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10792 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10793 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10794 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10795 // Make sure all the even/odd operands match.
10796 for (unsigned i = 2; i != NumElems; ++i)
10797 if (Ops[i % 2] != Op.getOperand(i))
10798 return false;
10799 return true;
10800 };
10801 if (CanSplat(Op, NumElems, Ops)) {
10802 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10803 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10804 // Create a new build vector and cast to v2i64/v2f64.
10805 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10806 DAG.getBuildVector(NarrowVT, dl, Ops));
10807 // Broadcast from v2i64/v2f64 and cast to final VT.
10808 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
10809 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10810 NewBV));
10811 }
10812 }
10813
10814 // For AVX-length vectors, build the individual 128-bit pieces and use
10815 // shuffles to put them in place.
10816 if (VT.getSizeInBits() > 128) {
10817 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
10818
10819 // Build both the lower and upper subvector.
10820 SDValue Lower =
10821 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10822 SDValue Upper = DAG.getBuildVector(
10823 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10824
10825 // Recreate the wider vector with the lower and upper part.
10826 return concatSubVectors(Lower, Upper, DAG, dl);
10827 }
10828
10829 // Let legalizer expand 2-wide build_vectors.
10830 if (EVTBits == 64) {
10831 if (NumNonZero == 1) {
10832 // One half is zero or undef.
10833 unsigned Idx = NonZeroMask.countTrailingZeros();
10834 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10835 Op.getOperand(Idx));
10836 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10837 }
10838 return SDValue();
10839 }
10840
10841 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10842 if (EVTBits == 8 && NumElems == 16)
10843 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
10844 DAG, Subtarget))
10845 return V;
10846
10847 if (EltVT == MVT::i16 && NumElems == 8)
10848 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
10849 DAG, Subtarget))
10850 return V;
10851
10852 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10853 if (EVTBits == 32 && NumElems == 4)
10854 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10855 return V;
10856
10857 // If element VT is == 32 bits, turn it into a number of shuffles.
10858 if (NumElems == 4 && NumZero > 0) {
10859 SmallVector<SDValue, 8> Ops(NumElems);
10860 for (unsigned i = 0; i < 4; ++i) {
10861 bool isZero = !NonZeroMask[i];
10862 if (isZero)
10863 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10864 else
10865 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10866 }
10867
10868 for (unsigned i = 0; i < 2; ++i) {
10869 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
10870 default: llvm_unreachable("Unexpected NonZero count");
10871 case 0:
10872 Ops[i] = Ops[i*2]; // Must be a zero vector.
10873 break;
10874 case 1:
10875 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10876 break;
10877 case 2:
10878 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10879 break;
10880 case 3:
10881 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10882 break;
10883 }
10884 }
10885
10886 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
10887 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
10888 int MaskVec[] = {
10889 Reverse1 ? 1 : 0,
10890 Reverse1 ? 0 : 1,
10891 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10892 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10893 };
10894 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10895 }
10896
10897 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10898
10899 // Check for a build vector from mostly shuffle plus few inserting.
10900 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10901 return Sh;
10902
10903 // For SSE 4.1, use insertps to put the high elements into the low element.
10904 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
10905 SDValue Result;
10906 if (!Op.getOperand(0).isUndef())
10907 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10908 else
10909 Result = DAG.getUNDEF(VT);
10910
10911 for (unsigned i = 1; i < NumElems; ++i) {
10912 if (Op.getOperand(i).isUndef()) continue;
10913 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10914 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10915 }
10916 return Result;
10917 }
10918
10919 // Otherwise, expand into a number of unpckl*, start by extending each of
10920 // our (non-undef) elements to the full vector width with the element in the
10921 // bottom slot of the vector (which generates no code for SSE).
10922 SmallVector<SDValue, 8> Ops(NumElems);
10923 for (unsigned i = 0; i < NumElems; ++i) {
10924 if (!Op.getOperand(i).isUndef())
10925 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10926 else
10927 Ops[i] = DAG.getUNDEF(VT);
10928 }
10929
10930 // Next, we iteratively mix elements, e.g. for v4f32:
10931 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10932 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10933 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10934 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10935 // Generate scaled UNPCKL shuffle mask.
10936 SmallVector<int, 16> Mask;
10937 for(unsigned i = 0; i != Scale; ++i)
10938 Mask.push_back(i);
10939 for (unsigned i = 0; i != Scale; ++i)
10940 Mask.push_back(NumElems+i);
10941 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10942
10943 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10944 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10945 }
10946 return Ops[0];
10947}
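// [Editorial sketch, not part of X86ISelLowering.cpp] Scalar model of the final
// unpckl expansion in LowerBUILD_VECTOR: starting from NumElems single-element
// vectors, each round interleaves adjacent pairs, so after log2(NumElems)
// rounds vector 0 holds every element in order (the v4f32 walkthrough in the
// comment above). Assumes NumElems is a power of two, as the real code does.
#include <cstddef>
#include <vector>

static std::vector<float> unpcklExpandModel(const std::vector<float> &Elems) {
  // Ops[i] models only the meaningful low lanes gathered into vector i so far.
  std::vector<std::vector<float>> Ops;
  for (float E : Elems)
    Ops.push_back({E});
  for (std::size_t Scale = 1; Scale < Elems.size(); Scale *= 2)
    for (std::size_t i = 0; i != Elems.size() / (2 * Scale); ++i) {
      std::vector<float> Merged = Ops[2 * i]; // low lanes of the even vector
      Merged.insert(Merged.end(), Ops[2 * i + 1].begin(), Ops[2 * i + 1].end());
      Ops[i] = Merged;                        // scaled UNPCKL of the pair
    }
  return Ops[0]; // same order as the original build_vector operands
}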
10948
10949// 256-bit AVX can use the vinsertf128 instruction
10950// to create 256-bit vectors from two other 128-bit ones.
10951// TODO: Detect subvector broadcast here instead of DAG combine?
10952static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10953 const X86Subtarget &Subtarget) {
10954 SDLoc dl(Op);
10955 MVT ResVT = Op.getSimpleValueType();
10956
10957 assert((ResVT.is256BitVector() ||
10958 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10959
10960 unsigned NumOperands = Op.getNumOperands();
10961 unsigned NumZero = 0;
10962 unsigned NumNonZero = 0;
10963 unsigned NonZeros = 0;
10964 for (unsigned i = 0; i != NumOperands; ++i) {
10965 SDValue SubVec = Op.getOperand(i);
10966 if (SubVec.isUndef())
10967 continue;
10968 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10969 ++NumZero;
10970 else {
10971 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10972 NonZeros |= 1 << i;
10973 ++NumNonZero;
10974 }
10975 }
10976
10977 // If we have more than 2 non-zeros, build each half separately.
10978 if (NumNonZero > 2) {
10979 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10980 ArrayRef<SDUse> Ops = Op->ops();
10981 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10982 Ops.slice(0, NumOperands/2));
10983 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10984 Ops.slice(NumOperands/2));
10985 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10986 }
10987
10988 // Otherwise, build it up through insert_subvectors.
10989 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10990 : DAG.getUNDEF(ResVT);
10991
10992 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10993 unsigned NumSubElems = SubVT.getVectorNumElements();
10994 for (unsigned i = 0; i != NumOperands; ++i) {
10995 if ((NonZeros & (1 << i)) == 0)
10996 continue;
10997
10998 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10999 Op.getOperand(i),
11000 DAG.getIntPtrConstant(i * NumSubElems, dl));
11001 }
11002
11003 return Vec;
11004}
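// [Editorial sketch, not part of X86ISelLowering.cpp] The insert_subvector path
// above, modeled on flat arrays: start from a zeroed result (getZeroVector) and
// copy only the sub-vectors recorded in NonZeros into their slots; undef and
// all-zero operands are simply skipped.
#include <algorithm>

static void concatViaInsertModel(const float *const *SubVecs,
                                 const bool *IsNonZero, unsigned NumOperands,
                                 unsigned NumSubElems, float *Res) {
  std::fill(Res, Res + NumOperands * NumSubElems, 0.0f);
  for (unsigned i = 0; i != NumOperands; ++i)
    if (IsNonZero[i])
      std::copy(SubVecs[i], SubVecs[i] + NumSubElems, Res + i * NumSubElems);
}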
11005
11006// Returns true if the given node is a type promotion (by concatenating i1
11007// zeros) of the result of a node that already zeros all upper bits of
11008// k-register.
11009// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11010static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11011 const X86Subtarget &Subtarget,
11012 SelectionDAG & DAG) {
11013 SDLoc dl(Op);
11014 MVT ResVT = Op.getSimpleValueType();
11015 unsigned NumOperands = Op.getNumOperands();
11016
11017 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11018 "Unexpected number of operands in CONCAT_VECTORS");
11019
11020 uint64_t Zeros = 0;
11021 uint64_t NonZeros = 0;
11022 for (unsigned i = 0; i != NumOperands; ++i) {
11023 SDValue SubVec = Op.getOperand(i);
11024 if (SubVec.isUndef())
11025 continue;
11026 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11027 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11028 Zeros |= (uint64_t)1 << i;
11029 else
11030 NonZeros |= (uint64_t)1 << i;
11031 }
11032
11033 unsigned NumElems = ResVT.getVectorNumElements();
11034
11035 // If we are inserting non-zero vector and there are zeros in LSBs and undef
11036 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
11037 // insert_subvector will give us two kshifts.
11038 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11039 Log2_64(NonZeros) != NumOperands - 1) {
11040 MVT ShiftVT = ResVT;
11041 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11042 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11043 unsigned Idx = Log2_64(NonZeros);
11044 SDValue SubVec = Op.getOperand(Idx);
11045 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11046 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11047 DAG.getUNDEF(ShiftVT), SubVec,
11048 DAG.getIntPtrConstant(0, dl));
11049 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11050 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11051 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11052 DAG.getIntPtrConstant(0, dl));
11053 }
11054
11055 // If there are zero or one non-zeros we can handle this very simply.
11056 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11057 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11058 if (!NonZeros)
11059 return Vec;
11060 unsigned Idx = Log2_64(NonZeros);
11061 SDValue SubVec = Op.getOperand(Idx);
11062 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11063 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11064 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11065 }
11066
11067 if (NumOperands > 2) {
11068 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11069 ArrayRef<SDUse> Ops = Op->ops();
11070 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11071 Ops.slice(0, NumOperands/2));
11072 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11073 Ops.slice(NumOperands/2));
11074 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11075 }
11076
11077 assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
11078
11079 if (ResVT.getVectorNumElements() >= 16)
11080 return Op; // The operation is legal with KUNPCK
11081
11082 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11083 DAG.getUNDEF(ResVT), Op.getOperand(0),
11084 DAG.getIntPtrConstant(0, dl));
11085 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11086 DAG.getIntPtrConstant(NumElems/2, dl));
11087}
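// [Editorial sketch, not part of X86ISelLowering.cpp] A vXi1 vector lives in a
// mask (k) register, so concatenating one non-zero sub-vector above zero/undef
// pieces amounts to a left shift of its bits; that is what the KSHIFTL path
// above emits instead of the two kshifts produced by the generic
// insert_subvector lowering. Modeled here on a 16-lane mask held in a uint16_t.
#include <cstdint>

static uint16_t kshiftlModel(uint16_t SubVecBits, unsigned Idx,
                             unsigned SubVecNumElts) {
  // Place the sub-vector's lanes at bit position Idx * SubVecNumElts; the lower
  // lanes stay zero, matching the zero sub-vectors below it.
  return static_cast<uint16_t>(SubVecBits << (Idx * SubVecNumElts));
}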
11088
11089static SDValue LowerCONCAT_VECTORS(SDValue Op,
11090 const X86Subtarget &Subtarget,
11091 SelectionDAG &DAG) {
11092 MVT VT = Op.getSimpleValueType();
11093 if (VT.getVectorElementType() == MVT::i1)
11094 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11095
11096 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11097 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11098 Op.getNumOperands() == 4)));
11099
11100 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11101 // from two other 128-bit ones.
11102
11103 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11104 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11105}
11106
11107//===----------------------------------------------------------------------===//
11108// Vector shuffle lowering
11109//
11110// This is an experimental code path for lowering vector shuffles on x86. It is
11111// designed to handle arbitrary vector shuffles and blends, gracefully
11112// degrading performance as necessary. It works hard to recognize idiomatic
11113// shuffles and lower them to optimal instruction patterns without leaving
11114// a framework that allows reasonably efficient handling of all vector shuffle
11115// patterns.
11116//===----------------------------------------------------------------------===//
11117
11118/// Tiny helper function to identify a no-op mask.
11119///
11120/// This is a somewhat boring predicate function. It checks whether the mask
11121/// array input, which is assumed to be a single-input shuffle mask of the kind
11122/// used by the X86 shuffle instructions (not a fully general
11123/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11124/// in-place shuffle are 'no-op's.
11125static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11126 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11127 assert(Mask[i] >= -1 && "Out of bound mask element!");
11128 if (Mask[i] >= 0 && Mask[i] != i)
11129 return false;
11130 }
11131 return true;
11132}
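// [Editorial sketch, not part of X86ISelLowering.cpp] Example inputs for
// isNoopShuffleMask: undef (-1) lanes are ignored, so the first mask is a
// no-op, while the second is not because lane 1 would receive element 0.
static const int NoopMaskExample[4] = {-1, 1, 2, -1};  // identity up to undef
static const int NonNoopMaskExample[4] = {0, 0, 2, 3}; // moves element 0 into lane 1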
11133
11134/// Test whether there are elements crossing LaneSizeInBits lanes in this
11135/// shuffle mask.
11136///
11137/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11138/// and we routinely test for these.
11139static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11140 unsigned ScalarSizeInBits,
11141 ArrayRef<int> Mask) {
11142 assert(LaneSizeInBits && ScalarSizeInBits &&
11143 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11144 "Illegal shuffle lane size");
11145 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11146 int Size = Mask.size();
11147 for (int i = 0; i < Size; ++i)
11148 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11149 return true;
11150 return false;
11151}
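// [Editorial sketch, not part of X86ISelLowering.cpp] Example masks for a v8i32
// shuffle (32-bit scalars, 128-bit lanes, so four elements per lane):
static const int CrossLaneMask[8] = {4, 5, 6, 7, 0, 1, 2, 3}; // halves swap: lane-crossing
static const int InLaneMask[8] = {1, 0, 3, 2, 5, 4, 7, 6};    // per-lane swap: not crossing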
11152
11153/// Test whether there are elements crossing 128-bit lanes in this
11154/// shuffle mask.
11155static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11156 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11157}
11158
11159/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11160/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11161/// better support 'repeated mask + lane permute' style shuffles.
11162static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11163 unsigned ScalarSizeInBits,
11164 ArrayRef<int> Mask) {
11165 assert(LaneSizeInBits && ScalarSizeInBits &&
11166 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11167 "Illegal shuffle lane size");
11168 int NumElts = Mask.size();
11169 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11170 int NumLanes = NumElts / NumEltsPerLane;
11171 if (NumLanes > 1) {
11172 for (int i = 0; i != NumLanes; ++i) {
11173 int SrcLane = -1;
11174 for (int j = 0; j != NumEltsPerLane; ++j) {
11175 int M = Mask[(i * NumEltsPerLane) + j];
11176 if (M < 0)
11177 continue;
11178 int Lane = (M % NumElts) / NumEltsPerLane;
11179 if (SrcLane >= 0 && SrcLane != Lane)
11180 return true;
11181 SrcLane = Lane;
11182 }
11183 }
11184 }
11185 return false;
11186}
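// Example for isMultiLaneShuffleMask above (illustrative): for a v8i32 mask
// with 128-bit lanes (NumEltsPerLane == 4), <0, 4, -1, -1, 4, 5, 6, 7> returns
// true because the first destination lane gathers elements from source lane 0
// (index 0) and source lane 1 (index 4). In contrast, <4, 5, 6, 7, 0, 1, 2, 3>
// returns false: each destination lane pulls from a single source lane, even
// though the mask is lane-crossing.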
11187
11188/// Test whether a shuffle mask is equivalent within each sub-lane.
11189///
11190/// This checks a shuffle mask to see if it is performing the same
11191/// lane-relative shuffle in each sub-lane. This trivially implies
11192/// that it is also not lane-crossing. It may however involve a blend from the
11193/// same lane of a second vector.
11194///
11195/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11196/// non-trivial to compute in the face of undef lanes. The representation is
11197/// suitable for use with existing 128-bit shuffles as entries from the second
11198/// vector have been remapped to [LaneSize, 2*LaneSize).
11199static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11200 ArrayRef<int> Mask,
11201 SmallVectorImpl<int> &RepeatedMask) {
11202 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11203 RepeatedMask.assign(LaneSize, -1);
11204 int Size = Mask.size();
11205 for (int i = 0; i < Size; ++i) {
11206 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11207 if (Mask[i] < 0)
11208 continue;
11209 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11210 // This entry crosses lanes, so there is no way to model this shuffle.
11211 return false;
11212
11213 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11214 // Adjust second vector indices to start at LaneSize instead of Size.
11215 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11216 : Mask[i] % LaneSize + LaneSize;
11217 if (RepeatedMask[i % LaneSize] < 0)
11218 // This is the first non-undef entry in this slot of a 128-bit lane.
11219 RepeatedMask[i % LaneSize] = LocalM;
11220 else if (RepeatedMask[i % LaneSize] != LocalM)
11221 // Found a mismatch with the repeated mask.
11222 return false;
11223 }
11224 return true;
11225}
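// Example for isRepeatedShuffleMask above (illustrative): checked against
// 128-bit lanes (LaneSize == 4), the v8i32 mask <0, 9, 2, 11, 4, 13, 6, 15>
// performs the same lane-relative shuffle in both lanes, so the function
// returns true with RepeatedMask == <0, 5, 2, 7> (second-operand entries
// remapped into [LaneSize, 2*LaneSize)).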
11226
11227/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11228static bool
11229is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11230 SmallVectorImpl<int> &RepeatedMask) {
11231 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11232}
11233
11234static bool
11235is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11236 SmallVector<int, 32> RepeatedMask;
11237 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11238}
11239
11240/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11241static bool
11242is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11243 SmallVectorImpl<int> &RepeatedMask) {
11244 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11245}
11246
11247/// Test whether a target shuffle mask is equivalent within each sub-lane.
11248/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11249static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11250 unsigned EltSizeInBits,
11251 ArrayRef<int> Mask,
11252 SmallVectorImpl<int> &RepeatedMask) {
11253 int LaneSize = LaneSizeInBits / EltSizeInBits;
11254 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11255 int Size = Mask.size();
11256 for (int i = 0; i < Size; ++i) {
11257 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11258 if (Mask[i] == SM_SentinelUndef)
11259 continue;
11260 if (Mask[i] == SM_SentinelZero) {
11261 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11262 return false;
11263 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11264 continue;
11265 }
11266 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11267 // This entry crosses lanes, so there is no way to model this shuffle.
11268 return false;
11269
11270 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11271 // later vector indices to start at multiples of LaneSize instead of Size.
11272 int LaneM = Mask[i] / Size;
11273 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11274 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11275 // This is the first non-undef entry in this slot of a 128-bit lane.
11276 RepeatedMask[i % LaneSize] = LocalM;
11277 else if (RepeatedMask[i % LaneSize] != LocalM)
11278 // Found a mismatch with the repeated mask.
11279 return false;
11280 }
11281 return true;
11282}
11283
11284/// Test whether a target shuffle mask is equivalent within each sub-lane.
11285/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11286static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11287 ArrayRef<int> Mask,
11288 SmallVectorImpl<int> &RepeatedMask) {
11289 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11290 Mask, RepeatedMask);
11291}
11292
11293/// Checks whether the vector elements referenced by two shuffle masks are
11294/// equivalent.
11295static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11296 int Idx, int ExpectedIdx) {
11297 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11298 ExpectedIdx < MaskSize && "Out of range element index");
11299 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11300 return false;
11301
11302 switch (Op.getOpcode()) {
11303 case ISD::BUILD_VECTOR:
11304 // If the values are build vectors, we can look through them to find
11305 // equivalent inputs that make the shuffles equivalent.
11306 // TODO: Handle MaskSize != Op.getNumOperands()?
11307 if (MaskSize == (int)Op.getNumOperands() &&
11308 MaskSize == (int)ExpectedOp.getNumOperands())
11309 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11310 break;
11311 case X86ISD::VBROADCAST:
11312 case X86ISD::VBROADCAST_LOAD:
11313 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11314 return (Op == ExpectedOp &&
11315 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11316 case X86ISD::HADD:
11317 case X86ISD::HSUB:
11318 case X86ISD::FHADD:
11319 case X86ISD::FHSUB:
11320 case X86ISD::PACKSS:
11321 case X86ISD::PACKUS:
11322 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11323 // TODO: Handle MaskSize != NumElts?
11324 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11325 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11326 MVT VT = Op.getSimpleValueType();
11327 int NumElts = VT.getVectorNumElements();
11328 if (MaskSize == NumElts) {
11329 int NumLanes = VT.getSizeInBits() / 128;
11330 int NumEltsPerLane = NumElts / NumLanes;
11331 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11332 bool SameLane =
11333 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11334 bool SameElt =
11335 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11336 return SameLane && SameElt;
11337 }
11338 }
11339 break;
11340 }
11341
11342 return false;
11343}
11344
11345/// Checks whether a shuffle mask is equivalent to an explicit list of
11346/// arguments.
11347///
11348/// This is a fast way to test a shuffle mask against a fixed pattern:
11349///
11350/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
11351///
11352/// It returns true if the mask is exactly as wide as the argument list, and
11353/// each element of the mask is either -1 (signifying undef) or the value given
11354/// in the argument.
11355static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
11356 SDValue V1 = SDValue(),
11357 SDValue V2 = SDValue()) {
11358 int Size = Mask.size();
11359 if (Size != (int)ExpectedMask.size())
11360 return false;
11361
11362 for (int i = 0; i < Size; ++i) {
11363 assert(Mask[i] >= -1 && "Out of bound mask element!");
11364 int MaskIdx = Mask[i];
11365 int ExpectedIdx = ExpectedMask[i];
11366 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
11367 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11368 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11369 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11370 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11371 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11372 return false;
11373 }
11374 }
11375 return true;
11376}
11377
11378/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
11379///
11380/// The masks must be exactly the same width.
11381///
11382/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
11383/// value in ExpectedMask is always accepted. Otherwise the indices must match.
11384///
11385/// SM_SentinelZero is accepted as a valid negative index but must match in
11386/// both.
11387static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
11388 ArrayRef<int> ExpectedMask,
11389 SDValue V1 = SDValue(),
11390 SDValue V2 = SDValue()) {
11391 int Size = Mask.size();
11392 if (Size != (int)ExpectedMask.size())
11393 return false;
11394 assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
11395 "Illegal target shuffle mask");
11396
11397 // Check for out-of-range target shuffle mask indices.
11398 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
11399 return false;
11400
11401 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
11402 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
11403 V1 = SDValue();
11404 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
11405 V2 = SDValue();
11406
11407 for (int i = 0; i < Size; ++i) {
11408 int MaskIdx = Mask[i];
11409 int ExpectedIdx = ExpectedMask[i];
11410 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
11411 continue;
11412 if (0 <= MaskIdx && 0 <= ExpectedIdx) {
11413 SDValue MaskV = MaskIdx < Size ? V1 : V2;
11414 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
11415 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
11416 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
11417 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
11418 continue;
11419 }
11420 // TODO - handle SM_Sentinel equivalences.
11421 return false;
11422 }
11423 return true;
11424}
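// Example for isTargetShuffleEquivalent above (illustrative):
//   Mask         = <SM_SentinelUndef, 1, SM_SentinelZero, 3>
//   ExpectedMask = <0,                1, SM_SentinelZero, 3>
// is accepted: the undef entry matches anything, the zero sentinel must (and
// does) match exactly, and the remaining indices are identical.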
11425
11426// Attempt to create a shuffle mask from a VSELECT condition mask.
11427static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
11428 SDValue Cond) {
11429 EVT CondVT = Cond.getValueType();
11430 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
11431 unsigned NumElts = CondVT.getVectorNumElements();
11432
11433 APInt UndefElts;
11434 SmallVector<APInt, 32> EltBits;
11435 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
11436 true, false))
11437 return false;
11438
11439 Mask.resize(NumElts, SM_SentinelUndef);
11440
11441 for (int i = 0; i != (int)NumElts; ++i) {
11442 Mask[i] = i;
11443 // Arbitrarily choose from the 2nd operand if the select condition element
11444 // is undef.
11445 // TODO: Can we do better by matching patterns such as even/odd?
11446 if (UndefElts[i] || EltBits[i].isNullValue())
11447 Mask[i] += NumElts;
11448 }
11449
11450 return true;
11451}
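// Example for createShuffleMaskFromVSELECT above (illustrative): for a v4i32
// VSELECT whose condition constant is <-1, 0, -1, 0> (all-ones selects the
// first operand), the resulting shuffle mask is <0, 5, 2, 7>: lanes with a
// zero (or undef) condition element are redirected to the second operand.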
11452
11453// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
11454// instructions.
11455static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
11456 if (VT != MVT::v8i32 && VT != MVT::v8f32)
11457 return false;
11458
11459 SmallVector<int, 8> Unpcklwd;
11460 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
11461 /* Unary = */ false);
11462 SmallVector<int, 8> Unpckhwd;
11463 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
11464 /* Unary = */ false);
11465 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd) ||
11466 isTargetShuffleEquivalent(VT, Mask, Unpckhwd));
11467 return IsUnpackwdMask;
11468}
11469
11470static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
11471 // Create 128-bit vector type based on mask size.
11472 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
11473 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
11474
11475 // We can't assume a canonical shuffle mask, so try the commuted version too.
11476 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11477 ShuffleVectorSDNode::commuteMask(CommutedMask);
11478
11479 // Match any of unary/binary or low/high.
11480 for (unsigned i = 0; i != 4; ++i) {
11481 SmallVector<int, 16> UnpackMask;
11482 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
11483 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask) ||
11484 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask))
11485 return true;
11486 }
11487 return false;
11488}
11489
11490/// Return true if a shuffle mask chooses elements identically in its top and
11491/// bottom halves. For example, any splat mask has the same top and bottom
11492/// halves. If an element is undefined in only one half of the mask, the halves
11493/// are not considered identical.
11494static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
11495 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
11496 unsigned HalfSize = Mask.size() / 2;
11497 for (unsigned i = 0; i != HalfSize; ++i) {
11498 if (Mask[i] != Mask[i + HalfSize])
11499 return false;
11500 }
11501 return true;
11502}
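// Example for hasIdenticalHalvesShuffleMask above (illustrative):
// <0, 5, 0, 5> returns true (both halves are <0, 5>), while <0, 5, -1, 5>
// returns false because the undef element appears in only one half.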
11503
11504/// Get a 4-lane 8-bit shuffle immediate for a mask.
11505///
11506/// This helper function produces an 8-bit shuffle immediate corresponding to
11507/// the ubiquitous shuffle encoding scheme used in x86 instructions for
11508/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
11509/// example.
11510///
11511/// NB: We rely heavily on "undef" masks preserving the input lane.
11512static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
11513 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
11514 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
11515 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
11516 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
11517 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
11518
11519 // If the mask only uses one non-undef element, then fully 'splat' it to
11520 // improve later broadcast matching.
11521 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
11522 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
11523
11524 int FirstElt = Mask[FirstIndex];
11525 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
11526 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
11527
11528 unsigned Imm = 0;
11529 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
11530 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
11531 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
11532 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
11533 return Imm;
11534}
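// Example for getV4X86ShuffleImm above (illustrative): the reversal mask
// <3, 2, 1, 0> encodes as 3 | (2 << 2) | (1 << 4) | (0 << 6) == 0x1B, and a
// single-element mask such as <1, -1, -1, -1> is splatted to 0x55 so it can
// later be matched as a broadcast.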
11535
11536static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
11537 SelectionDAG &DAG) {
11538 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
11539}
11540
11541// The shuffle result has the following form:
11542// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
11543// Each element of Zeroable corresponds to a particular element of Mask,
11544// as described in the computeZeroableShuffleElements function.
11545//
11546// The function looks for a sub-mask whose nonzero elements are in
11547// increasing order. If such a sub-mask exists, the function returns true.
11548static bool isNonZeroElementsInOrder(const APInt &Zeroable,
11549 ArrayRef<int> Mask, const EVT &VectorType,
11550 bool &IsZeroSideLeft) {
11551 int NextElement = -1;
11552 // Check if the Mask's nonzero elements are in increasing order.
11553 for (int i = 0, e = Mask.size(); i < e; i++) {
11554 // Checks if the mask's zeros elements are built from only zeros.
11555 assert(Mask[i] >= -1 && "Out of bound mask element!");
11556 if (Mask[i] < 0)
11557 return false;
11558 if (Zeroable[i])
11559 continue;
11560 // Find the lowest non zero element
11561 if (NextElement < 0) {
11562 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
11563 IsZeroSideLeft = NextElement != 0;
11564 }
11565 // Exit if the mask's non zero elements are not in increasing order.
11566 if (NextElement != Mask[i])
11567 return false;
11568 NextElement++;
11569 }
11570 return true;
11571}
11572
11573/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
11574static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
11575 ArrayRef<int> Mask, SDValue V1,
11576 SDValue V2, const APInt &Zeroable,
11577 const X86Subtarget &Subtarget,
11578 SelectionDAG &DAG) {
11579 int Size = Mask.size();
11580 int LaneSize = 128 / VT.getScalarSizeInBits();
11581 const int NumBytes = VT.getSizeInBits() / 8;
11582 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
11583
11584 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
11585 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
11586 (Subtarget.hasBWI() && VT.is512BitVector()));
11587
11588 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
11589 // Sign bit set in i8 mask means zero element.
11590 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
11591
11592 SDValue V;
11593 for (int i = 0; i < NumBytes; ++i) {
11594 int M = Mask[i / NumEltBytes];
11595 if (M < 0) {
11596 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
11597 continue;
11598 }
11599 if (Zeroable[i / NumEltBytes]) {
11600 PSHUFBMask[i] = ZeroMask;
11601 continue;
11602 }
11603
11604 // We can only use a single input of V1 or V2.
11605 SDValue SrcV = (M >= Size ? V2 : V1);
11606 if (V && V != SrcV)
11607 return SDValue();
11608 V = SrcV;
11609 M %= Size;
11610
11611 // PSHUFB can't cross lanes, ensure this doesn't happen.
11612 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
11613 return SDValue();
11614
11615 M = M % LaneSize;
11616 M = M * NumEltBytes + (i % NumEltBytes);
11617 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
11618 }
11619 assert(V && "Failed to find a source input");
11620
11621 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
11622 return DAG.getBitcast(
11623 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
11624 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
11625}
11626
11627static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
11628 const X86Subtarget &Subtarget, SelectionDAG &DAG,
11629 const SDLoc &dl);
11630
11631// X86 has dedicated shuffle that can be lowered to VEXPAND
11632static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
11633 const APInt &Zeroable,
11634 ArrayRef<int> Mask, SDValue &V1,
11635 SDValue &V2, SelectionDAG &DAG,
11636 const X86Subtarget &Subtarget) {
11637 bool IsLeftZeroSide = true;
11638 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
11639 IsLeftZeroSide))
11640 return SDValue();
11641 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
11642 MVT IntegerType =
11643 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11644 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
11645 unsigned NumElts = VT.getVectorNumElements();
11646 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
11647 "Unexpected number of vector elements");
11648 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
11649 Subtarget, DAG, DL);
11650 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
11651 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
11652 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
11653}
11654
11655static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
11656 unsigned &UnpackOpcode, bool IsUnary,
11657 ArrayRef<int> TargetMask, const SDLoc &DL,
11658 SelectionDAG &DAG,
11659 const X86Subtarget &Subtarget) {
11660 int NumElts = VT.getVectorNumElements();
11661
11662 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
11663 for (int i = 0; i != NumElts; i += 2) {
11664 int M1 = TargetMask[i + 0];
11665 int M2 = TargetMask[i + 1];
11666 Undef1 &= (SM_SentinelUndef == M1);
11667 Undef2 &= (SM_SentinelUndef == M2);
11668 Zero1 &= isUndefOrZero(M1);
11669 Zero2 &= isUndefOrZero(M2);
11670 }
11671 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
11672 "Zeroable shuffle detected");
11673
11674 // Attempt to match the target mask against the unpack lo/hi mask patterns.
11675 SmallVector<int, 64> Unpckl, Unpckh;
11676 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
11677 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, V1,
11678 (IsUnary ? V1 : V2))) {
11679 UnpackOpcode = X86ISD::UNPCKL;
11680 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11681 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11682 return true;
11683 }
11684
11685 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
11686 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, V1,
11687 (IsUnary ? V1 : V2))) {
11688 UnpackOpcode = X86ISD::UNPCKH;
11689 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
11690 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
11691 return true;
11692 }
11693
11694 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
11695 if (IsUnary && (Zero1 || Zero2)) {
11696 // Don't bother if we can blend instead.
11697 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
11698 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
11699 return false;
11700
11701 bool MatchLo = true, MatchHi = true;
11702 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
11703 int M = TargetMask[i];
11704
11705 // Ignore if the input is known to be zero or the index is undef.
11706 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
11707 (M == SM_SentinelUndef))
11708 continue;
11709
11710 MatchLo &= (M == Unpckl[i]);
11711 MatchHi &= (M == Unpckh[i]);
11712 }
11713
11714 if (MatchLo || MatchHi) {
11715 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11716 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11717 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
11718 return true;
11719 }
11720 }
11721
11722 // If a binary shuffle, commute and try again.
11723 if (!IsUnary) {
11724 ShuffleVectorSDNode::commuteMask(Unpckl);
11725 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl)) {
11726 UnpackOpcode = X86ISD::UNPCKL;
11727 std::swap(V1, V2);
11728 return true;
11729 }
11730
11731 ShuffleVectorSDNode::commuteMask(Unpckh);
11732 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh)) {
11733 UnpackOpcode = X86ISD::UNPCKH;
11734 std::swap(V1, V2);
11735 return true;
11736 }
11737 }
11738
11739 return false;
11740}
11741
11742// X86 has dedicated unpack instructions that can handle specific blend
11743// operations: UNPCKH and UNPCKL.
11744static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
11745 ArrayRef<int> Mask, SDValue V1, SDValue V2,
11746 SelectionDAG &DAG) {
11747 SmallVector<int, 8> Unpckl;
11748 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11749 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11750 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11751
11752 SmallVector<int, 8> Unpckh;
11753 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11754 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11755 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11756
11757 // Commute and try again.
11758 ShuffleVectorSDNode::commuteMask(Unpckl);
11759 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11760 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11761
11762 ShuffleVectorSDNode::commuteMask(Unpckh);
11763 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11764 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11765
11766 return SDValue();
11767}
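// Example for lowerShuffleWithUNPCK above (illustrative): for v4i32,
// createUnpackShuffleMask produces <0, 4, 1, 5> (lo) and <2, 6, 3, 7> (hi), so
// the shuffle <0, 4, 1, 5> lowers to UNPCKL(V1, V2) and <6, 2, 7, 3> lowers to
// UNPCKH(V2, V1) via the commuted-mask check.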
11768
11769/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11770/// followed by unpack 256-bit.
11771static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11772 ArrayRef<int> Mask, SDValue V1,
11773 SDValue V2, SelectionDAG &DAG) {
11774 SmallVector<int, 32> Unpckl, Unpckh;
11775 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11776 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11777
11778 unsigned UnpackOpcode;
11779 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
11780 UnpackOpcode = X86ISD::UNPCKL;
11781 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
11782 UnpackOpcode = X86ISD::UNPCKH;
11783 else
11784 return SDValue();
11785
11786 // This is a "natural" unpack operation (rather than the 128-bit sectored
11787 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11788 // input in order to use the x86 instruction.
11789 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11790 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11791 V1 = DAG.getBitcast(VT, V1);
11792 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11793}
11794
11795// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
11796// source into the lower elements and zeroing the upper elements.
11797static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
11798 ArrayRef<int> Mask, const APInt &Zeroable,
11799 const X86Subtarget &Subtarget) {
11800 if (!VT.is512BitVector() && !Subtarget.hasVLX())
11801 return false;
11802
11803 unsigned NumElts = Mask.size();
11804 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11805 unsigned MaxScale = 64 / EltSizeInBits;
11806
11807 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11808 unsigned SrcEltBits = EltSizeInBits * Scale;
11809 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11810 continue;
11811 unsigned NumSrcElts = NumElts / Scale;
11812 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
11813 continue;
11814 unsigned UpperElts = NumElts - NumSrcElts;
11815 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11816 continue;
11817 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
11818 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
11819 DstVT = MVT::getIntegerVT(EltSizeInBits);
11820 if ((NumSrcElts * EltSizeInBits) >= 128) {
11821 // ISD::TRUNCATE
11822 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
11823 } else {
11824 // X86ISD::VTRUNC
11825 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
11826 }
11827 return true;
11828 }
11829
11830 return false;
11831}
11832
11833// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
11834// element padding to the final DstVT.
11835static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
11836 const X86Subtarget &Subtarget,
11837 SelectionDAG &DAG, bool ZeroUppers) {
11838 MVT SrcVT = Src.getSimpleValueType();
11839 MVT DstSVT = DstVT.getScalarType();
11840 unsigned NumDstElts = DstVT.getVectorNumElements();
11841 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11842 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
11843
11844 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
11845 return SDValue();
11846
11847 // Perform a direct ISD::TRUNCATE if possible.
11848 if (NumSrcElts == NumDstElts)
11849 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
11850
11851 if (NumSrcElts > NumDstElts) {
11852 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11853 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11854 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
11855 }
11856
11857 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
11858 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
11859 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
11860 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11861 DstVT.getSizeInBits());
11862 }
11863
11864 // Non-VLX targets must truncate from a 512-bit type, so we need to
11865 // widen, truncate and then possibly extract the original subvector.
11866 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
11867 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
11868 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
11869 }
11870
11871 // Fallback to a X86ISD::VTRUNC, padding if necessary.
11872 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
11873 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
11874 if (DstVT != TruncVT)
11875 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
11876 DstVT.getSizeInBits());
11877 return Trunc;
11878}
11879
11880// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11881//
11882// An example is the following:
11883//
11884// t0: ch = EntryToken
11885// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11886// t25: v4i32 = truncate t2
11887// t41: v8i16 = bitcast t25
11888// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11889// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11890// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11891// t18: v2i64 = bitcast t51
11892//
11893// One can just use a single vpmovdw instruction; without avx512vl we need to
11894// use the zmm variant and extract the lower subvector, padding with zeroes.
11895// TODO: Merge with lowerShuffleAsVTRUNC.
11896static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
11897 SDValue V2, ArrayRef<int> Mask,
11898 const APInt &Zeroable,
11899 const X86Subtarget &Subtarget,
11900 SelectionDAG &DAG) {
11901 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
11902 if (!Subtarget.hasAVX512())
11903 return SDValue();
11904
11905 unsigned NumElts = VT.getVectorNumElements();
11906 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11907 unsigned MaxScale = 64 / EltSizeInBits;
11908 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11909 unsigned NumSrcElts = NumElts / Scale;
11910 unsigned UpperElts = NumElts - NumSrcElts;
11911 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11912 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11913 continue;
11914
11915 SDValue Src = V1;
11916 if (!Src.hasOneUse())
11917 return SDValue();
11918
11919 Src = peekThroughOneUseBitcasts(Src);
11920 if (Src.getOpcode() != ISD::TRUNCATE ||
11921 Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
11922 return SDValue();
11923 Src = Src.getOperand(0);
11924
11925 // VPMOVWB is only available with avx512bw.
11926 MVT SrcVT = Src.getSimpleValueType();
11927 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11928 !Subtarget.hasBWI())
11929 return SDValue();
11930
11931 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
11932 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11933 }
11934
11935 return SDValue();
11936}
11937
11938// Attempt to match binary shuffle patterns as a truncate.
11939static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
11940 SDValue V2, ArrayRef<int> Mask,
11941 const APInt &Zeroable,
11942 const X86Subtarget &Subtarget,
11943 SelectionDAG &DAG) {
11944 assert((VT.is128BitVector() || VT.is256BitVector()) &&
11945 "Unexpected VTRUNC type");
11946 if (!Subtarget.hasAVX512())
11947 return SDValue();
11948
11949 unsigned NumElts = VT.getVectorNumElements();
11950 unsigned EltSizeInBits = VT.getScalarSizeInBits();
11951 unsigned MaxScale = 64 / EltSizeInBits;
11952 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
11953 // TODO: Support non-BWI VPMOVWB truncations?
11954 unsigned SrcEltBits = EltSizeInBits * Scale;
11955 if (SrcEltBits < 32 && !Subtarget.hasBWI())
11956 continue;
11957
11958 // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
11959 // Bail if the V2 elements are undef.
11960 unsigned NumHalfSrcElts = NumElts / Scale;
11961 unsigned NumSrcElts = 2 * NumHalfSrcElts;
11962 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
11963 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
11964 continue;
11965
11966 // The elements beyond the truncation must be undef/zero.
11967 unsigned UpperElts = NumElts - NumSrcElts;
11968 if (UpperElts > 0 &&
11969 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
11970 continue;
11971 bool UndefUppers =
11972 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
11973
11974 // As we're using both sources then we need to concat them together
11975 // and truncate from the double-sized src.
11976 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
11977 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
11978
11979 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
11980 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
11981 Src = DAG.getBitcast(SrcVT, Src);
11982 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
11983 }
11984
11985 return SDValue();
11986}
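// Example for lowerShuffleAsVTRUNC above (illustrative, assuming AVX512): a
// v8i16 shuffle <0, 2, 4, 6, 8, 10, 12, 14> of V1 and V2 concatenates the
// operands into a v16i16 node, bitcasts it to v8i32 and truncates back to
// v8i16, keeping the low 16 bits of each 32-bit pair.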
11987
11988/// Check whether a compaction lowering can be done by dropping even
11989/// elements and compute how many times even elements must be dropped.
11990///
11991/// This handles shuffles which take every Nth element where N is a power of
11992/// two. Example shuffle masks:
11993///
11994/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11995/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11996/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11997/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11998/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11999/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12000///
12001/// Any of these lanes can of course be undef.
12002///
12003/// This routine only supports N <= 3.
12004/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12005/// for larger N.
12006///
12007/// \returns N above, or the number of times even elements must be dropped if
12008/// there is such a number. Otherwise returns zero.
12009static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12010 bool IsSingleInput) {
12011 // The modulus for the shuffle vector entries is based on whether this is
12012 // a single input or not.
12013 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12014 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12015 "We should only be called with masks with a power-of-2 size!");
12016
12017 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12018
12019 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12020 // and 2^3 simultaneously. This is because we may have ambiguity with
12021 // partially undef inputs.
12022 bool ViableForN[3] = {true, true, true};
12023
12024 for (int i = 0, e = Mask.size(); i < e; ++i) {
12025 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12026 // want.
12027 if (Mask[i] < 0)
12028 continue;
12029
12030 bool IsAnyViable = false;
12031 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12032 if (ViableForN[j]) {
12033 uint64_t N = j + 1;
12034
12035 // The shuffle mask must be equal to (i * 2^N) % M.
12036 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12037 IsAnyViable = true;
12038 else
12039 ViableForN[j] = false;
12040 }
12041 // Early exit if we exhaust the possible powers of two.
12042 if (!IsAnyViable)
12043 break;
12044 }
12045
12046 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12047 if (ViableForN[j])
12048 return j + 1;
12049
12050 // Return 0 as there is no viable power of two.
12051 return 0;
12052}
12053
12054// X86 has dedicated pack instructions that can handle specific truncation
12055// operations: PACKSS and PACKUS.
12056// Checks for compaction shuffle masks if MaxStages > 1.
12057// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12058static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12059 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12060 const SelectionDAG &DAG,
12061 const X86Subtarget &Subtarget,
12062 unsigned MaxStages = 1) {
12063 unsigned NumElts = VT.getVectorNumElements();
12064 unsigned BitSize = VT.getScalarSizeInBits();
12065 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12066 "Illegal maximum compaction");
12067
12068 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12069 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12070 unsigned NumPackedBits = NumSrcBits - BitSize;
12071 N1 = peekThroughBitcasts(N1);
12072 N2 = peekThroughBitcasts(N2);
12073 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12074 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12075 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12076 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12077 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12078 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12079 return false;
12080 if (Subtarget.hasSSE41() || BitSize == 8) {
12081 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12082 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12083 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12084 V1 = N1;
12085 V2 = N2;
12086 SrcVT = PackVT;
12087 PackOpcode = X86ISD::PACKUS;
12088 return true;
12089 }
12090 }
12091 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12092 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12093 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12094 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12095 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12096 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12097 V1 = N1;
12098 V2 = N2;
12099 SrcVT = PackVT;
12100 PackOpcode = X86ISD::PACKSS;
12101 return true;
12102 }
12103 return false;
12104 };
12105
12106 // Attempt to match against wider and wider compaction patterns.
12107 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12108 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12109 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12110
12111 // Try binary shuffle.
12112 SmallVector<int, 32> BinaryMask;
12113 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12114 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, V1, V2))
12115 if (MatchPACK(V1, V2, PackVT))
12116 return true;
12117
12118 // Try unary shuffle.
12119 SmallVector<int, 32> UnaryMask;
12120 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12121 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, V1))
12122 if (MatchPACK(V1, V1, PackVT))
12123 return true;
12124 }
12125
12126 return false;
12127}
12128
12129static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12130 SDValue V1, SDValue V2, SelectionDAG &DAG,
12131 const X86Subtarget &Subtarget) {
12132 MVT PackVT;
12133 unsigned PackOpcode;
12134 unsigned SizeBits = VT.getSizeInBits();
12135 unsigned EltBits = VT.getScalarSizeInBits();
12136 unsigned MaxStages = Log2_32(64 / EltBits);
12137 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12138 Subtarget, MaxStages))
12139 return SDValue();
12140
12141 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12142 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12143
12144 // Don't lower multi-stage packs on AVX512, truncation is better.
12145 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12146 return SDValue();
12147
12148 // Pack to the largest type possible:
12149 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12150 unsigned MaxPackBits = 16;
12151 if (CurrentEltBits > 16 &&
12152 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12153 MaxPackBits = 32;
12154
12155 // Repeatedly pack down to the target size.
12156 SDValue Res;
12157 for (unsigned i = 0; i != NumStages; ++i) {
12158 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12159 unsigned NumSrcElts = SizeBits / SrcEltBits;
12160 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12161 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12162 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12163 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12164 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12165 DAG.getBitcast(SrcVT, V2));
12166 V1 = V2 = Res;
12167 CurrentEltBits /= 2;
12168 }
12169 assert(Res && Res.getValueType() == VT &&
12170 "Failed to lower compaction shuffle");
12171 return Res;
12172}
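// Example for lowerShuffleWithPACK above (illustrative): a v8i16 shuffle
// <0, 2, 4, 6, 8, 10, 12, 14> matches a single-stage binary pack. If both
// inputs are bitcasts of v4i32 values whose upper 16 bits are known zero, it
// lowers to PACKUS (PACKUSDW needs SSE4.1); if they have more than 16 sign
// bits, it lowers to PACKSS.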
12173
12174/// Try to emit a bitmask instruction for a shuffle.
12175///
12176/// This handles cases where we can model a blend exactly as a bitmask due to
12177/// one of the inputs being zeroable.
12178static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12179 SDValue V2, ArrayRef<int> Mask,
12180 const APInt &Zeroable,
12181 const X86Subtarget &Subtarget,
12182 SelectionDAG &DAG) {
12183 MVT MaskVT = VT;
12184 MVT EltVT = VT.getVectorElementType();
12185 SDValue Zero, AllOnes;
12186 // Use f64 if i64 isn't legal.
12187 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12188 EltVT = MVT::f64;
12189 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12190 }
12191
12192 MVT LogicVT = VT;
12193 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12194 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12195 APFloat AllOnesValue = APFloat::getAllOnesValue(
12196 SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
12197 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12198 LogicVT =
12199 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12200 } else {
12201 Zero = DAG.getConstant(0, DL, EltVT);
12202 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12203 }
12204
12205 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12206 SDValue V;
12207 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12208 if (Zeroable[i])
12209 continue;
12210 if (Mask[i] % Size != i)
12211 return SDValue(); // Not a blend.
12212 if (!V)
12213 V = Mask[i] < Size ? V1 : V2;
12214 else if (V != (Mask[i] < Size ? V1 : V2))
12215 return SDValue(); // Can only let one input through the mask.
12216
12217 VMaskOps[i] = AllOnes;
12218 }
12219 if (!V)
12220 return SDValue(); // No non-zeroable elements!
12221
12222 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12223 VMask = DAG.getBitcast(LogicVT, VMask);
12224 V = DAG.getBitcast(LogicVT, V);
12225 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12226 return DAG.getBitcast(VT, And);
12227}
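// Example for lowerShuffleAsBitMask above (illustrative): for a v4i32 shuffle
// with mask <0, 1, 2, 3> where elements 2 and 3 are zeroable, the blend is
// modeled as AND(V1, <-1, -1, 0, 0>), letting the bitmask clear the zero
// lanes.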
12228
12229/// Try to emit a blend instruction for a shuffle using bit math.
12230///
12231/// This is used as a fallback approach when first class blend instructions are
12232/// unavailable. Currently it is only suitable for integer vectors, but could
12233/// be generalized for floating point vectors if desirable.
12234static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12235 SDValue V2, ArrayRef<int> Mask,
12236 SelectionDAG &DAG) {
12237 assert(VT.isInteger() && "Only supports integer vector types!");
12238 MVT EltVT = VT.getVectorElementType();
12239 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12240 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12241 SmallVector<SDValue, 16> MaskOps;
12242 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12243 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12244 return SDValue(); // Shuffled input!
12245 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12246 }
12247
12248 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12249 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12250 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12251 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12252}
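// Example for lowerShuffleAsBitBlend above (illustrative): the v4i32 blend
// <0, 5, 2, 7> builds V1Mask = <-1, 0, -1, 0> and computes
// (V1 & V1Mask) | (~V1Mask & V2), selecting V1 in lanes 0/2 and V2 in 1/3.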
12253
12254static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12255 SDValue PreservedSrc,
12256 const X86Subtarget &Subtarget,
12257 SelectionDAG &DAG);
12258
12259static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12260 MutableArrayRef<int> Mask,
12261 const APInt &Zeroable, bool &ForceV1Zero,
12262 bool &ForceV2Zero, uint64_t &BlendMask) {
12263 bool V1IsZeroOrUndef =
12264 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12265 bool V2IsZeroOrUndef =
12266 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12267
12268 BlendMask = 0;
12269 ForceV1Zero = false, ForceV2Zero = false;
12270 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12271
12272 // Attempt to generate the binary blend mask. If an input is zero then
12273 // we can use any lane.
12274 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12275 int M = Mask[i];
12276 if (M == SM_SentinelUndef)
12277 continue;
12278 if (M == i ||
12279 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12280 Mask[i] = i;
12281 continue;
12282 }
12283 if (M == (i + Size) ||
12284 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12285 BlendMask |= 1ull << i;
12286 Mask[i] = i + Size;
12287 continue;
12288 }
12289 if (Zeroable[i]) {
12290 if (V1IsZeroOrUndef) {
12291 ForceV1Zero = true;
12292 Mask[i] = i;
12293 continue;
12294 }
12295 if (V2IsZeroOrUndef) {
12296 ForceV2Zero = true;
12297 BlendMask |= 1ull << i;
12298 Mask[i] = i + Size;
12299 continue;
12300 }
12301 }
12302 return false;
12303 }
12304 return true;
12305}
12306
12307static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
12308 int Scale) {
12309 uint64_t ScaledMask = 0;
12310 for (int i = 0; i != Size; ++i)
12311 if (BlendMask & (1ull << i))
12312 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
12313 return ScaledMask;
12314}
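// Worked example (illustrative): scaleVectorShuffleBlendMask(0b0101, /*Size=*/4,
// /*Scale=*/2) expands each set bit into Scale consecutive bits, producing
// 0b00110011 - the same blend expressed on a vector with twice as many
// (half-width) elements.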
12315
12316/// Try to emit a blend instruction for a shuffle.
12317///
12318/// This doesn't do any checks for the availability of instructions for blending
12319/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
12320/// be matched in the backend with the type given. What it does check for is
12321/// that the shuffle mask is a blend, or convertible into a blend with zero.
12322static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
12323 SDValue V2, ArrayRef<int> Original,
12324 const APInt &Zeroable,
12325 const X86Subtarget &Subtarget,
12326 SelectionDAG &DAG) {
12327 uint64_t BlendMask = 0;
12328 bool ForceV1Zero = false, ForceV2Zero = false;
12329 SmallVector<int, 64> Mask(Original.begin(), Original.end());
12330 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
12331 BlendMask))
12332 return SDValue();
12333
12334 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
12335 if (ForceV1Zero)
12336 V1 = getZeroVector(VT, Subtarget, DAG, DL);
12337 if (ForceV2Zero)
12338 V2 = getZeroVector(VT, Subtarget, DAG, DL);
12339
12340 switch (VT.SimpleTy) {
12341 case MVT::v4i64:
12342 case MVT::v8i32:
12343 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
12344 LLVM_FALLTHROUGH;
12345 case MVT::v4f64:
12346 case MVT::v8f32:
12347 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
12348 LLVM_FALLTHROUGH;
12349 case MVT::v2f64:
12350 case MVT::v2i64:
12351 case MVT::v4f32:
12352 case MVT::v4i32:
12353 case MVT::v8i16:
12354 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
12355 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
12356 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12357 case MVT::v16i16: {
12358 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
12359 SmallVector<int, 8> RepeatedMask;
12360 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12361 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
12362 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
12363 BlendMask = 0;
12364 for (int i = 0; i < 8; ++i)
12365 if (RepeatedMask[i] >= 8)
12366 BlendMask |= 1ull << i;
12367 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12368 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
12369 }
12370 // Use PBLENDW for lower/upper lanes and then blend lanes.
12371 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
12372 // merge to VSELECT where useful.
12373 uint64_t LoMask = BlendMask & 0xFF;
12374 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
12375 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
12376 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12377 DAG.getTargetConstant(LoMask, DL, MVT::i8));
12378 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
12379 DAG.getTargetConstant(HiMask, DL, MVT::i8));
12380 return DAG.getVectorShuffle(
12381 MVT::v16i16, DL, Lo, Hi,
12382 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
12383 }
12384 LLVM_FALLTHROUGH;
12385 }
12386 case MVT::v32i8:
12387 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
12388 LLVM_FALLTHROUGH;
12389 case MVT::v16i8: {
12390 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
12391
12392 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
12393 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12394 Subtarget, DAG))
12395 return Masked;
12396
12397 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
12398 MVT IntegerType =
12399 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12400 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12401 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12402 }
12403
12404 // If we have VPTERNLOG, we can use that as a bit blend.
12405 if (Subtarget.hasVLX())
12406 if (SDValue BitBlend =
12407 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12408 return BitBlend;
12409
12410 // Scale the blend by the number of bytes per element.
12411 int Scale = VT.getScalarSizeInBits() / 8;
12412
12413 // This form of blend is always done on bytes. Compute the byte vector
12414 // type.
12415 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12416
12417 // x86 allows load folding with blendvb from the 2nd source operand. But
12418 // we are still using LLVM select here (see comment below), so that's V1.
12419 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
12420 // allow that load-folding possibility.
12421 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
12422 ShuffleVectorSDNode::commuteMask(Mask);
12423 std::swap(V1, V2);
12424 }
12425
12426 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
12427 // mix of LLVM's code generator and the x86 backend. We tell the code
12428 // generator that boolean values in the elements of an x86 vector register
12429 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
12430 // mapping a select to operand #1, and 'false' mapping to operand #2. The
12431 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
12432 // of the element (the remaining are ignored) and 0 in that high bit would
12433 // mean operand #1 while 1 in the high bit would mean operand #2. So while
12434 // the LLVM model for boolean values in vector elements gets the relevant
12435 // bit set, it is set backwards and over constrained relative to x86's
12436 // actual model.
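  // For example (illustrative): a shuffle index that selects V1 (Mask[i] < Size)
  // expands to Scale mask bytes of -1, one that selects V2 expands to bytes of
  // 0, and an undef index yields undef mask bytes; getSelect then routes
  // "true" (-1) bytes to V1 and "false" (0) bytes to V2.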
12437 SmallVector<SDValue, 32> VSELECTMask;
12438 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12439 for (int j = 0; j < Scale; ++j)
12440 VSELECTMask.push_back(
12441 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
12442 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
12443 MVT::i8));
12444
12445 V1 = DAG.getBitcast(BlendVT, V1);
12446 V2 = DAG.getBitcast(BlendVT, V2);
12447 return DAG.getBitcast(
12448 VT,
12449 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
12450 V1, V2));
12451 }
12452 case MVT::v16f32:
12453 case MVT::v8f64:
12454 case MVT::v8i64:
12455 case MVT::v16i32:
12456 case MVT::v32i16:
12457 case MVT::v64i8: {
12458 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
12459 bool OptForSize = DAG.shouldOptForSize();
12460 if (!OptForSize) {
12461 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
12462 Subtarget, DAG))
12463 return Masked;
12464 }
12465
12466 // Otherwise load an immediate into a GPR, cast to k-register, and use a
12467 // masked move.
12468 MVT IntegerType =
12469 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12470 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
12471 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
12472 }
12473 default:
12474 llvm_unreachable("Not a supported integer vector type!")::llvm::llvm_unreachable_internal("Not a supported integer vector type!"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 12474)
;
12475 }
12476}
12477
12478/// Try to lower as a blend of elements from two inputs followed by
12479/// a single-input permutation.
12480///
12481/// This matches the pattern where we can blend elements from two inputs and
12482/// then reduce the shuffle to a single-input permutation.
12483static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
12484 SDValue V1, SDValue V2,
12485 ArrayRef<int> Mask,
12486 SelectionDAG &DAG,
12487 bool ImmBlends = false) {
12488 // We build up the blend mask while checking whether a blend is a viable way
12489 // to reduce the shuffle.
12490 SmallVector<int, 32> BlendMask(Mask.size(), -1);
12491 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
12492
12493 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12494 if (Mask[i] < 0)
12495 continue;
12496
12497 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
12498
12499 if (BlendMask[Mask[i] % Size] < 0)
12500 BlendMask[Mask[i] % Size] = Mask[i];
12501 else if (BlendMask[Mask[i] % Size] != Mask[i])
12502 return SDValue(); // Can't blend in the needed input!
12503
12504 PermuteMask[i] = Mask[i] % Size;
12505 }
12506
12507 // If only immediate blends, then bail if the blend mask can't be widened to
12508 // i16.
12509 unsigned EltSize = VT.getScalarSizeInBits();
12510 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
12511 return SDValue();
12512
12513 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
12514 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
12515}
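// Worked example (illustrative): for a 4-element shuffle with
// Mask = {2, 5, 0, 7}, the loop builds BlendMask = {0, 5, 2, 7} (a legal blend,
// since each slot takes its own lane from one input) and
// PermuteMask = {2, 1, 0, 3}, so the shuffle becomes blend(V1, V2, {0,5,2,7})
// followed by a single-input permute {2, 1, 0, 3}.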
12516
12517/// Try to lower as an unpack of elements from two inputs followed by
12518/// a single-input permutation.
12519///
12520/// This matches the pattern where we can unpack elements from two inputs and
12521/// then reduce the shuffle to a single-input (wider) permutation.
12522static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
12523 SDValue V1, SDValue V2,
12524 ArrayRef<int> Mask,
12525 SelectionDAG &DAG) {
12526 int NumElts = Mask.size();
12527 int NumLanes = VT.getSizeInBits() / 128;
12528 int NumLaneElts = NumElts / NumLanes;
12529 int NumHalfLaneElts = NumLaneElts / 2;
12530
12531 bool MatchLo = true, MatchHi = true;
12532 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12533
12534 // Determine UNPCKL/UNPCKH type and operand order.
12535 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12536 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
12537 int M = Mask[Lane + Elt];
12538 if (M < 0)
12539 continue;
12540
12541 SDValue &Op = Ops[Elt & 1];
12542 if (M < NumElts && (Op.isUndef() || Op == V1))
12543 Op = V1;
12544 else if (NumElts <= M && (Op.isUndef() || Op == V2))
12545 Op = V2;
12546 else
12547 return SDValue();
12548
12549 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
12550 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
12551 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
12552 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
12553 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
12554 if (!MatchLo && !MatchHi)
12555 return SDValue();
12556 }
12557 }
12558 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
12559
12560 // Now check that each pair of elts come from the same unpack pair
12561 // and set the permute mask based on each pair.
12562 // TODO - Investigate cases where we permute individual elements.
12563 SmallVector<int, 32> PermuteMask(NumElts, -1);
12564 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
12565 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
12566 int M0 = Mask[Lane + Elt + 0];
12567 int M1 = Mask[Lane + Elt + 1];
12568 if (0 <= M0 && 0 <= M1 &&
12569 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
12570 return SDValue();
12571 if (0 <= M0)
12572 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
12573 if (0 <= M1)
12574 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
12575 }
12576 }
12577
12578 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12579 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
12580 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
12581}
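// Worked example (illustrative): for v8i16 with
// Mask = {1, 9, 0, 8, 3, 11, 2, 10}, even slots read V1 and odd slots read V2,
// and every index lands in the low half of its input, so MatchLo holds and
// Ops = {V1, V2}. The element pairs then give PermuteMask = {2, 3, 0, 1, 6, 7, 4, 5},
// i.e. UNPCKL(V1, V2) followed by a single-input permute of the word pairs.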
12582
12583/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
12584/// permuting the elements of the result in place.
12585static SDValue lowerShuffleAsByteRotateAndPermute(
12586 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12587 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12588 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
12589 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
12590 (VT.is512BitVector() && !Subtarget.hasBWI()))
12591 return SDValue();
12592
12593 // We don't currently support lane crossing permutes.
12594 if (is128BitLaneCrossingShuffleMask(VT, Mask))
12595 return SDValue();
12596
12597 int Scale = VT.getScalarSizeInBits() / 8;
12598 int NumLanes = VT.getSizeInBits() / 128;
12599 int NumElts = VT.getVectorNumElements();
12600 int NumEltsPerLane = NumElts / NumLanes;
12601
12602 // Determine range of mask elts.
12603 bool Blend1 = true;
12604 bool Blend2 = true;
12605 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
12606 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
12607 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12608 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12609 int M = Mask[Lane + Elt];
12610 if (M < 0)
12611 continue;
12612 if (M < NumElts) {
12613 Blend1 &= (M == (Lane + Elt));
12614 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12615 M = M % NumEltsPerLane;
12616 Range1.first = std::min(Range1.first, M);
12617 Range1.second = std::max(Range1.second, M);
12618 } else {
12619 M -= NumElts;
12620 Blend2 &= (M == (Lane + Elt));
12621 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
12622 M = M % NumEltsPerLane;
12623 Range2.first = std::min(Range2.first, M);
12624 Range2.second = std::max(Range2.second, M);
12625 }
12626 }
12627 }
12628
12629 // Bail if we don't need both elements.
12630 // TODO - it might be worth doing this for unary shuffles if the permute
12631 // can be widened.
12632 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
12633 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
12634 return SDValue();
12635
12636 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
12637 return SDValue();
12638
12639 // Rotate the 2 ops so we can access both ranges, then permute the result.
12640 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
12641 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12642 SDValue Rotate = DAG.getBitcast(
12643 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
12644 DAG.getBitcast(ByteVT, Lo),
12645 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
12646 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
12647 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
12648 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
12649 int M = Mask[Lane + Elt];
12650 if (M < 0)
12651 continue;
12652 if (M < NumElts)
12653 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
12654 else
12655 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
12656 }
12657 }
12658 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
12659 };
12660
12661 // Check if the ranges are small enough to rotate from either direction.
12662 if (Range2.second < Range1.first)
12663 return RotateAndPermute(V1, V2, Range1.first, 0);
12664 if (Range1.second < Range2.first)
12665 return RotateAndPermute(V2, V1, Range2.first, NumElts);
12666 return SDValue();
12667}
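// Illustrative sketch: on v16i8, if the referenced V1 elements all fall in
// [12, 15] and the referenced V2 elements (minus NumElts) all fall in [0, 3],
// then Range2.second (3) < Range1.first (12), so RotateAndPermute(V1, V2, 12, 0)
// emits PALIGNR(V2, V1, 12): V1[12..15] land in bytes 0..3 and V2[0..3] in
// bytes 4..7, and the trailing permute indexes into that rotated vector.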
12668
12669/// Generic routine to decompose a shuffle and blend into independent
12670/// blends and permutes.
12671///
12672/// This matches the extremely common pattern for handling combined
12673/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
12674/// operations. It will try to pick the best arrangement of shuffles and
12675/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
12676static SDValue lowerShuffleAsDecomposedShuffleMerge(
12677 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12678 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12679 int NumElts = Mask.size();
12680 int NumLanes = VT.getSizeInBits() / 128;
12681 int NumEltsPerLane = NumElts / NumLanes;
12682
12683 // Shuffle the input elements into the desired positions in V1 and V2 and
12684 // unpack/blend them together.
12685 bool IsAlternating = true;
12686 SmallVector<int, 32> V1Mask(NumElts, -1);
12687 SmallVector<int, 32> V2Mask(NumElts, -1);
12688 SmallVector<int, 32> FinalMask(NumElts, -1);
12689 for (int i = 0; i < NumElts; ++i) {
12690 int M = Mask[i];
12691 if (M >= 0 && M < NumElts) {
12692 V1Mask[i] = M;
12693 FinalMask[i] = i;
12694 IsAlternating &= (i & 1) == 0;
12695 } else if (M >= NumElts) {
12696 V2Mask[i] = M - NumElts;
12697 FinalMask[i] = i + NumElts;
12698 IsAlternating &= (i & 1) == 1;
12699 }
12700 }
12701
12702 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
12703 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
12704 // the shuffle may be able to fold with a load or other benefit. However, when
12705 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
12706 // pre-shuffle first is a better strategy.
12707 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
12708 // Only prefer immediate blends to unpack/rotate.
12709 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12710 DAG, true))
12711 return BlendPerm;
12712 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
12713 DAG))
12714 return UnpackPerm;
12715 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
12716 DL, VT, V1, V2, Mask, Subtarget, DAG))
12717 return RotatePerm;
12718 // Unpack/rotate failed - try again with variable blends.
12719 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
12720 DAG))
12721 return BlendPerm;
12722 }
12723
12724 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
12725 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
12726 // TODO: It doesn't have to be alternating - but each lane mustn't have more
12727 // than half the elements coming from each source.
12728 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
12729 V1Mask.assign(NumElts, -1);
12730 V2Mask.assign(NumElts, -1);
12731 FinalMask.assign(NumElts, -1);
12732 for (int i = 0; i != NumElts; i += NumEltsPerLane)
12733 for (int j = 0; j != NumEltsPerLane; ++j) {
12734 int M = Mask[i + j];
12735 if (M >= 0 && M < NumElts) {
12736 V1Mask[i + (j / 2)] = M;
12737 FinalMask[i + j] = i + (j / 2);
12738 } else if (M >= NumElts) {
12739 V2Mask[i + (j / 2)] = M - NumElts;
12740 FinalMask[i + j] = i + (j / 2) + NumElts;
12741 }
12742 }
12743 }
12744
12745 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
12746 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
12747 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
12748}
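// Worked example (illustrative): for v8i16 with
// Mask = {1, 9, 3, 11, 5, 13, 7, 15}, none of the blend/unpack/rotate
// strategies above apply, but the mask is alternating (V1 in even slots, V2 in
// odd slots), so the code pre-shuffles V1 and V2 to {1, 3, 5, 7, ...} each and
// finishes with FinalMask = {0, 8, 1, 9, 2, 10, 3, 11}, i.e. a single UNPCKL.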
12749
12750/// Try to lower a vector shuffle as a bit rotation.
12751///
12752/// Look for a repeated rotation pattern in each sub group.
12753 /// Returns an ISD::ROTL element rotation amount or -1 on failure.
12754static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
12755 int NumElts = Mask.size();
12756 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
12757
12758 int RotateAmt = -1;
12759 for (int i = 0; i != NumElts; i += NumSubElts) {
12760 for (int j = 0; j != NumSubElts; ++j) {
12761 int M = Mask[i + j];
12762 if (M < 0)
12763 continue;
12764 if (!isInRange(M, i, i + NumSubElts))
12765 return -1;
12766 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
12767 if (0 <= RotateAmt && Offset != RotateAmt)
12768 return -1;
12769 RotateAmt = Offset;
12770 }
12771 }
12772 return RotateAmt;
12773}
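// Worked example (illustrative): Mask = {3, 0, 1, 2, 7, 4, 5, 6} with
// NumSubElts = 4 gives Offset = (4 - (M - (i + j))) % 4 == 1 for every element
// of both groups, so the routine returns a rotation of 1 element (a
// rotate-left of each 4-element group, little-endian).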
12774
12775static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
12776 const X86Subtarget &Subtarget,
12777 ArrayRef<int> Mask) {
12778 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12779 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
12780
12781 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
12782 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
12783 int MaxSubElts = 64 / EltSizeInBits;
12784 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
12785 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
12786 if (RotateAmt < 0)
12787 continue;
12788
12789 int NumElts = Mask.size();
12790 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
12791 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
12792 return RotateAmt * EltSizeInBits;
12793 }
12794
12795 return -1;
12796}
12797
12798/// Lower shuffle using X86ISD::VROTLI rotations.
12799static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
12800 ArrayRef<int> Mask,
12801 const X86Subtarget &Subtarget,
12802 SelectionDAG &DAG) {
12803 // Only XOP + AVX512 targets have bit rotation instructions.
12804 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
12805 bool IsLegal =
12806 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
12807 if (!IsLegal && Subtarget.hasSSE3())
12808 return SDValue();
12809
12810 MVT RotateVT;
12811 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
12812 Subtarget, Mask);
12813 if (RotateAmt < 0)
12814 return SDValue();
12815
12816 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
12817 // expanded to OR(SRL,SHL), will be more efficient, but if they can
12818 // widen to vXi16 or more then the existing lowering will be better.
12819 if (!IsLegal) {
12820 if ((RotateAmt % 16) == 0)
12821 return SDValue();
12822 // TODO: Use getTargetVShiftByConstNode.
12823 unsigned ShlAmt = RotateAmt;
12824 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
12825 V1 = DAG.getBitcast(RotateVT, V1);
12826 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
12827 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
12828 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
12829 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
12830 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
12831 return DAG.getBitcast(VT, Rot);
12832 }
12833
12834 SDValue Rot =
12835 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
12836 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
12837 return DAG.getBitcast(VT, Rot);
12838}
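// Illustrative sketch: for a v16i8 shuffle whose mask rotates each 4-byte
// group by one byte (e.g. {3, 0, 1, 2, 7, 4, 5, 6, ...}) the matcher picks
// RotateVT = v4i32 with an 8-bit rotation; on an SSE2-only target the
// fallback emits OR(VSHLI(V1, 8), VSRLI(V1, 24)) on v4i32 and bitcasts back to
// v16i8, while XOP/AVX512 targets emit VROTLI directly.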
12839
12840/// Try to match a vector shuffle as an element rotation.
12841///
12842 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
12843static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
12844 ArrayRef<int> Mask) {
12845 int NumElts = Mask.size();
12846
12847 // We need to detect various ways of spelling a rotation:
12848 // [11, 12, 13, 14, 15, 0, 1, 2]
12849 // [-1, 12, 13, 14, -1, -1, 1, -1]
12850 // [-1, -1, -1, -1, -1, -1, 1, 2]
12851 // [ 3, 4, 5, 6, 7, 8, 9, 10]
12852 // [-1, 4, 5, 6, -1, -1, 9, -1]
12853 // [-1, 4, 5, 6, -1, -1, -1, -1]
12854 int Rotation = 0;
12855 SDValue Lo, Hi;
12856 for (int i = 0; i < NumElts; ++i) {
12857 int M = Mask[i];
12858 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
12859 "Unexpected mask index.");
12860 if (M < 0)
12861 continue;
12862
12863 // Determine where a rotated vector would have started.
12864 int StartIdx = i - (M % NumElts);
12865 if (StartIdx == 0)
12866 // The identity rotation isn't interesting, stop.
12867 return -1;
12868
12869 // If we found the tail of a vector the rotation must be the missing
12870 // front. If we found the head of a vector, it must be how much of the
12871 // head.
12872 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
12873
12874 if (Rotation == 0)
12875 Rotation = CandidateRotation;
12876 else if (Rotation != CandidateRotation)
12877 // The rotations don't match, so we can't match this mask.
12878 return -1;
12879
12880 // Compute which value this mask is pointing at.
12881 SDValue MaskV = M < NumElts ? V1 : V2;
12882
12883 // Compute which of the two target values this index should be assigned
12884 // to. This reflects whether the high elements are remaining or the low
12885 // elements are remaining.
12886 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
12887
12888 // Either set up this value if we've not encountered it before, or check
12889 // that it remains consistent.
12890 if (!TargetV)
12891 TargetV = MaskV;
12892 else if (TargetV != MaskV)
12893 // This may be a rotation, but it pulls from the inputs in some
12894 // unsupported interleaving.
12895 return -1;
12896 }
12897
12898 // Check that we successfully analyzed the mask, and normalize the results.
12899 assert(Rotation != 0 && "Failed to locate a viable rotation!");
12900 assert((Lo || Hi) && "Failed to find a rotated input vector!");
12901 if (!Lo)
12902 Lo = Hi;
12903 else if (!Hi)
12904 Hi = Lo;
12905
12906 V1 = Lo;
12907 V2 = Hi;
12908
12909 return Rotation;
12910}
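// Worked example (illustrative): for the 8-element mask
// {11, 12, 13, 14, 15, 0, 1, 2}, element 0 gives StartIdx = 0 - (11 % 8) = -3
// and element 5 gives StartIdx = 5, both yielding a candidate rotation of 3;
// the routine returns 3 with Lo = V1 and Hi = V2.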
12911
12912/// Try to lower a vector shuffle as a byte rotation.
12913///
12914/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
12915/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
12916/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
12917 /// try to generically lower a vector shuffle through such a pattern. It
12918/// does not check for the profitability of lowering either as PALIGNR or
12919/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
12920/// This matches shuffle vectors that look like:
12921///
12922/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
12923///
12924/// Essentially it concatenates V1 and V2, shifts right by some number of
12925/// elements, and takes the low elements as the result. Note that while this is
12926/// specified as a *right shift* because x86 is little-endian, it is a *left
12927/// rotate* of the vector lanes.
12928static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
12929 ArrayRef<int> Mask) {
12930 // Don't accept any shuffles with zero elements.
12931 if (isAnyZero(Mask))
12932 return -1;
12933
12934 // PALIGNR works on 128-bit lanes.
12935 SmallVector<int, 16> RepeatedMask;
12936 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
12937 return -1;
12938
12939 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
12940 if (Rotation <= 0)
12941 return -1;
12942
12943 // PALIGNR rotates bytes, so we need to scale the
12944 // rotation based on how many bytes are in the vector lane.
12945 int NumElts = RepeatedMask.size();
12946 int Scale = 16 / NumElts;
12947 return Rotation * Scale;
12948}
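// Illustrative note: for the v8i16 mask above the element rotation is 3 and
// Scale = 16 / 8 = 2 bytes per element, so the returned PALIGNR amount is 6.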
12949
12950static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
12951 SDValue V2, ArrayRef<int> Mask,
12952 const X86Subtarget &Subtarget,
12953 SelectionDAG &DAG) {
12954 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12955
12956 SDValue Lo = V1, Hi = V2;
12957 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
12958 if (ByteRotation <= 0)
12959 return SDValue();
12960
12961 // Cast the inputs to i8 vector of correct length to match PALIGNR or
12962 // PSLLDQ/PSRLDQ.
12963 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
12964 Lo = DAG.getBitcast(ByteVT, Lo);
12965 Hi = DAG.getBitcast(ByteVT, Hi);
12966
12967 // SSSE3 targets can use the palignr instruction.
12968 if (Subtarget.hasSSSE3()) {
12969 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12970 "512-bit PALIGNR requires BWI instructions");
12971 return DAG.getBitcast(
12972 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12973 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12974 }
12975
12976 assert(VT.is128BitVector() &&
12977 "Rotate-based lowering only supports 128-bit lowering!");
12978 assert(Mask.size() <= 16 &&
12979 "Can shuffle at most 16 bytes in a 128-bit vector!");
12980 assert(ByteVT == MVT::v16i8 &&
12981 "SSE2 rotate lowering only needed for v16i8!");
12982
12983 // Default SSE2 implementation
12984 int LoByteShift = 16 - ByteRotation;
12985 int HiByteShift = ByteRotation;
12986
12987 SDValue LoShift =
12988 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12989 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12990 SDValue HiShift =
12991 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12992 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12993 return DAG.getBitcast(VT,
12994 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12995}
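// Illustrative sketch: with a byte rotation of 6 on SSE2 (no SSSE3), the code
// emits OR(PSLLDQ(Lo, 10), PSRLDQ(Hi, 6)): bytes 0..9 of the result come from
// Hi[6..15] and bytes 10..15 come from Lo[0..5], matching what
// PALIGNR(Lo, Hi, 6) would produce on SSSE3.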
12996
12997/// Try to lower a vector shuffle as a dword/qword rotation.
12998///
12999 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13000 /// rotation of the concatenation of two vectors; this routine will
13001 /// try to generically lower a vector shuffle through such a pattern.
13002///
13003/// Essentially it concatenates V1 and V2, shifts right by some number of
13004/// elements, and takes the low elements as the result. Note that while this is
13005/// specified as a *right shift* because x86 is little-endian, it is a *left
13006/// rotate* of the vector lanes.
13007static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13008 SDValue V2, ArrayRef<int> Mask,
13009 const X86Subtarget &Subtarget,
13010 SelectionDAG &DAG) {
13011 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13012 "Only 32-bit and 64-bit elements are supported!");
13013
13014 // 128/256-bit vectors are only supported with VLX.
13015 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13016 && "VLX required for 128/256-bit vectors");
13017
13018 SDValue Lo = V1, Hi = V2;
13019 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13020 if (Rotation <= 0)
13021 return SDValue();
13022
13023 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13024 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13025}
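// Worked example (illustrative): a v8i32 shuffle with
// Mask = {3, 4, 5, 6, 7, 8, 9, 10} matches an element rotation of 3, so this
// lowers to a single VALIGND with immediate 3 instead of a lane-crossing
// shuffle sequence.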
13026
13027/// Try to lower a vector shuffle as a byte shift sequence.
13028static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13029 SDValue V2, ArrayRef<int> Mask,
13030 const APInt &Zeroable,
13031 const X86Subtarget &Subtarget,
13032 SelectionDAG &DAG) {
13033 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13034 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13035
13036 // We need a shuffle that has zeros at one/both ends and a sequential
13037 // shuffle from one source within.
13038 unsigned ZeroLo = Zeroable.countTrailingOnes();
13039 unsigned ZeroHi = Zeroable.countLeadingOnes();
13040 if (!ZeroLo && !ZeroHi)
13041 return SDValue();
13042
13043 unsigned NumElts = Mask.size();
13044 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13045 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13046 return SDValue();
13047
13048 unsigned Scale = VT.getScalarSizeInBits() / 8;
13049 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13050 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13051 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13052 return SDValue();
13053
13054 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13055 Res = DAG.getBitcast(MVT::v16i8, Res);
13056
13057 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13058 // inner sequential set of elements, possibly offset:
13059 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13060 // 01234567 --> 4567zzzz --> zzzzz456
13061 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13062 if (ZeroLo == 0) {
13063 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13064 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13065 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13066 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13067 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13068 } else if (ZeroHi == 0) {
13069 unsigned Shift = Mask[ZeroLo] % NumElts;
13070 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13071 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13072 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13073 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13074 } else if (!Subtarget.hasSSSE3()) {
13075 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13076 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13077 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13078 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13079 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13080 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13081 Shift += Mask[ZeroLo] % NumElts;
13082 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13083 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13084 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13085 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13086 } else
13087 return SDValue();
13088
13089 return DAG.getBitcast(VT, Res);
13090}
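// Worked example (illustrative): for a v8i16 shuffle where elements 0, 1 and 7
// are zeroable and Mask[2..6] = {0, 1, 2, 3, 4}, ZeroLo = 2 and ZeroHi = 1, so
// on SSE2 the third branch emits PSLLDQ by 6, PSRLDQ by 6 and PSLLDQ by 4
// bytes, leaving {z, z, 0, 1, 2, 3, 4, z}.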
13091
13092/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13093///
13094/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13095/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13096/// matches elements from one of the input vectors shuffled to the left or
13097/// right with zeroable elements 'shifted in'. It handles both the strictly
13098/// bit-wise element shifts and the byte shift across an entire 128-bit double
13099/// quad word lane.
13100///
13101/// PSHL : (little-endian) left bit shift.
13102/// [ zz, 0, zz, 2 ]
13103/// [ -1, 4, zz, -1 ]
13104/// PSRL : (little-endian) right bit shift.
13105/// [ 1, zz, 3, zz]
13106/// [ -1, -1, 7, zz]
13107/// PSLLDQ : (little-endian) left byte shift
13108/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13109/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13110/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13111/// PSRLDQ : (little-endian) right byte shift
13112/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13113/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13114/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13115static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13116 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13117 int MaskOffset, const APInt &Zeroable,
13118 const X86Subtarget &Subtarget) {
13119 int Size = Mask.size();
13120 unsigned SizeInBits = Size * ScalarSizeInBits;
13121
13122 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13123 for (int i = 0; i < Size; i += Scale)
13124 for (int j = 0; j < Shift; ++j)
13125 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13126 return false;
13127
13128 return true;
13129 };
13130
13131 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13132 for (int i = 0; i != Size; i += Scale) {
13133 unsigned Pos = Left ? i + Shift : i;
13134 unsigned Low = Left ? i : i + Shift;
13135 unsigned Len = Scale - Shift;
13136 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13137 return -1;
13138 }
13139
13140 int ShiftEltBits = ScalarSizeInBits * Scale;
13141 bool ByteShift = ShiftEltBits > 64;
13142 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13143 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13144 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
13145
13146 // Normalize the scale for byte shifts to still produce an i64 element
13147 // type.
13148 Scale = ByteShift ? Scale / 2 : Scale;
13149
13150 // We need to round trip through the appropriate type for the shift.
13151 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
13152 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
13153 : MVT::getVectorVT(ShiftSVT, Size / Scale);
13154 return (int)ShiftAmt;
13155 };
13156
13157 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
13158 // keep doubling the size of the integer elements up to that. We can
13159 // then shift the elements of the integer vector by whole multiples of
13160 // their width within the elements of the larger integer vector. Test each
13161 // multiple to see if we can find a match with the moved element indices
13162 // and that the shifted in elements are all zeroable.
13163 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
13164 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
13165 for (int Shift = 1; Shift != Scale; ++Shift)
13166 for (bool Left : {true, false})
13167 if (CheckZeros(Shift, Scale, Left)) {
13168 int ShiftAmt = MatchShift(Shift, Scale, Left);
13169 if (0 < ShiftAmt)
13170 return ShiftAmt;
13171 }
13172
13173 // no match
13174 return -1;
13175}
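// Worked example (illustrative): for a v4i32 shuffle with Mask = {zz, 0, zz, 2}
// (elements 0 and 2 zeroable), Scale = 2 and Shift = 1 pass CheckZeros and
// MatchShift; ShiftEltBits = 64, so this stays a bit shift with
// Opcode = VSHLI, ShiftVT = v2i64 and a returned amount of 32 bits
// (i.e. PSLLQ by 32).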
13176
13177static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
13178 SDValue V2, ArrayRef<int> Mask,
13179 const APInt &Zeroable,
13180 const X86Subtarget &Subtarget,
13181 SelectionDAG &DAG) {
13182 int Size = Mask.size();
13183 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13184
13185 MVT ShiftVT;
13186 SDValue V = V1;
13187 unsigned Opcode;
13188
13189 // Try to match shuffle against V1 shift.
13190 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13191 Mask, 0, Zeroable, Subtarget);
13192
13193 // If V1 failed, try to match shuffle against V2 shift.
13194 if (ShiftAmt < 0) {
13195 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
13196 Mask, Size, Zeroable, Subtarget);
13197 V = V2;
13198 }
13199
13200 if (ShiftAmt < 0)
13201 return SDValue();
13202
13203 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
13204 "Illegal integer vector type");
13205 V = DAG.getBitcast(ShiftVT, V);
13206 V = DAG.getNode(Opcode, DL, ShiftVT, V,
13207 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
13208 return DAG.getBitcast(VT, V);
13209}
13210
13211// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
13212// Remainder of lower half result is zero and upper half is all undef.
13213static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
13214 ArrayRef<int> Mask, uint64_t &BitLen,
13215 uint64_t &BitIdx, const APInt &Zeroable) {
13216 int Size = Mask.size();
13217 int HalfSize = Size / 2;
13218 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13219 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
13220
13221 // Upper half must be undefined.
13222 if (!isUndefUpperHalf(Mask))
13223 return false;
13224
13225 // Determine the extraction length from the part of the
13226 // lower half that isn't zeroable.
13227 int Len = HalfSize;
13228 for (; Len > 0; --Len)
13229 if (!Zeroable[Len - 1])
13230 break;
13231 assert(Len > 0 && "Zeroable shuffle mask");
13232
13233 // Attempt to match first Len sequential elements from the lower half.
13234 SDValue Src;
13235 int Idx = -1;
13236 for (int i = 0; i != Len; ++i) {
13237 int M = Mask[i];
13238 if (M == SM_SentinelUndef)
13239 continue;
13240 SDValue &V = (M < Size ? V1 : V2);
13241 M = M % Size;
13242
13243 // The extracted elements must start at a valid index and all mask
13244 // elements must be in the lower half.
13245 if (i > M || M >= HalfSize)
13246 return false;
13247
13248 if (Idx < 0 || (Src == V && Idx == (M - i))) {
13249 Src = V;
13250 Idx = M - i;
13251 continue;
13252 }
13253 return false;
13254 }
13255
13256 if (!Src || Idx < 0)
13257 return false;
13258
13259 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
13260 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13261 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13262 V1 = Src;
13263 return true;
13264}
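// Worked example (illustrative): for v8i16 with Mask = {1, 2, zz, zz, u, u, u, u}
// (upper half undef, elements 2 and 3 zeroable), Len = 2 and Idx = 1, giving
// BitLen = 32 and BitIdx = 16: EXTRQI pulls 32 bits starting at bit 16 of V1
// into the low half and zeroes the rest of it.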
13265
13266// INSERTQ: Extract lowest Len elements from lower half of second source and
13267// insert over first source, starting at Idx.
13268// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
13269static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
13270 ArrayRef<int> Mask, uint64_t &BitLen,
13271 uint64_t &BitIdx) {
13272 int Size = Mask.size();
13273 int HalfSize = Size / 2;
13274 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
13275
13276 // Upper half must be undefined.
13277 if (!isUndefUpperHalf(Mask))
13278 return false;
13279
13280 for (int Idx = 0; Idx != HalfSize; ++Idx) {
13281 SDValue Base;
13282
13283 // Attempt to match first source from mask before insertion point.
13284 if (isUndefInRange(Mask, 0, Idx)) {
13285 /* EMPTY */
13286 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
13287 Base = V1;
13288 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
13289 Base = V2;
13290 } else {
13291 continue;
13292 }
13293
13294 // Extend the extraction length looking to match both the insertion of
13295 // the second source and the remaining elements of the first.
13296 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
13297 SDValue Insert;
13298 int Len = Hi - Idx;
13299
13300 // Match insertion.
13301 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
13302 Insert = V1;
13303 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
13304 Insert = V2;
13305 } else {
13306 continue;
13307 }
13308
13309 // Match the remaining elements of the lower half.
13310 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
13311 /* EMPTY */
13312 } else if ((!Base || (Base == V1)) &&
13313 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
13314 Base = V1;
13315 } else if ((!Base || (Base == V2)) &&
13316 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
13317 Size + Hi)) {
13318 Base = V2;
13319 } else {
13320 continue;
13321 }
13322
13323 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
13324 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
13325 V1 = Base;
13326 V2 = Insert;
13327 return true;
13328 }
13329 }
13330
13331 return false;
13332}
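// Worked example (illustrative): for v8i16 with Mask = {0, 8, 9, 3, u, u, u, u},
// the scan settles on Idx = 1, Len = 2 with Base = V1 and Insert = V2, so
// BitLen = 32 and BitIdx = 16: INSERTQI drops the low 32 bits of V2 into V1 at
// bit offset 16, producing {V1[0], V2[0], V2[1], V1[3], undef, ...}.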
13333
13334/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
13335static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
13336 SDValue V2, ArrayRef<int> Mask,
13337 const APInt &Zeroable, SelectionDAG &DAG) {
13338 uint64_t BitLen, BitIdx;
13339 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
13340 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
13341 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13342 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13343
13344 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
13345 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
13346 V2 ? V2 : DAG.getUNDEF(VT),
13347 DAG.getTargetConstant(BitLen, DL, MVT::i8),
13348 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
13349
13350 return SDValue();
13351}
13352
13353/// Lower a vector shuffle as a zero or any extension.
13354///
13355/// Given a specific number of elements, element bit width, and extension
13356/// stride, produce either a zero or any extension based on the available
13357 /// features of the subtarget. The extended elements are consecutive and
13358 /// can begin at an offsetted element index in the input; to avoid excess
13359 /// shuffling, the offset must either be in the bottom lane or at the
13360 /// start of a higher lane. All extended elements must come from the
13361 /// same lane.
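/// For example (illustrative, hypothetical values): with VT = v16i8,
/// Scale = 4, Offset = 0 and AnyExt = false, the SSE4.1 path below emits a
/// ZERO_EXTEND_VECTOR_INREG to v4i32 of the low four bytes of InputV; with
/// Offset = 4, InputV is first shuffled so that element 4 moves to element 0.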
13362static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
13363 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
13364 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13365 assert(Scale > 1 && "Need a scale to extend.");
13366 int EltBits = VT.getScalarSizeInBits();
13367 int NumElements = VT.getVectorNumElements();
13368 int NumEltsPerLane = 128 / EltBits;
13369 int OffsetLane = Offset / NumEltsPerLane;
13370 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
13371        "Only 8, 16, and 32 bit elements can be extended.");
13372 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
13373 assert(0 <= Offset && "Extension offset must be positive.");
13374 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
13375        "Extension offset must be in the first lane or start an upper lane.");
13376
13377 // Check that an index is in same lane as the base offset.
13378 auto SafeOffset = [&](int Idx) {
13379 return OffsetLane == (Idx / NumEltsPerLane);
13380 };
13381
13382 // Shift along an input so that the offset base moves to the first element.
13383 auto ShuffleOffset = [&](SDValue V) {
13384 if (!Offset)
13385 return V;
13386
13387 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13388 for (int i = 0; i * Scale < NumElements; ++i) {
13389 int SrcIdx = i + Offset;
13390 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
13391 }
13392 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
13393 };
13394
13395 // Found a valid a/zext mask! Try various lowering strategies based on the
13396 // input type and available ISA extensions.
13397 if (Subtarget.hasSSE41()) {
13398 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
13399 // PUNPCK will catch this in a later shuffle match.
13400 if (Offset && Scale == 2 && VT.is128BitVector())
13401 return SDValue();
13402 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
13403 NumElements / Scale);
13404 InputV = ShuffleOffset(InputV);
13405 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
13406 DL, ExtVT, InputV, DAG);
13407 return DAG.getBitcast(VT, InputV);
13408 }
13409
13410 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
13411
13412 // For any extends we can cheat for larger element sizes and use shuffle
13413 // instructions that can fold with a load and/or copy.
13414 if (AnyExt && EltBits == 32) {
13415 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
13416 -1};
13417 return DAG.getBitcast(
13418 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13419 DAG.getBitcast(MVT::v4i32, InputV),
13420 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13421 }
13422 if (AnyExt && EltBits == 16 && Scale > 2) {
13423 int PSHUFDMask[4] = {Offset / 2, -1,
13424 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
13425 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
13426 DAG.getBitcast(MVT::v4i32, InputV),
13427 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13428 int PSHUFWMask[4] = {1, -1, -1, -1};
13429 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
13430 return DAG.getBitcast(
13431 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
13432 DAG.getBitcast(MVT::v8i16, InputV),
13433 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
13434 }
13435
13436 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
13437 // to 64-bits.
13438 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
13439 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
13440 assert(VT.is128BitVector() && "Unexpected vector width!");
13441
13442 int LoIdx = Offset * EltBits;
13443 SDValue Lo = DAG.getBitcast(
13444 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13445 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13446 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
13447
13448 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
13449 return DAG.getBitcast(VT, Lo);
13450
13451 int HiIdx = (Offset + 1) * EltBits;
13452 SDValue Hi = DAG.getBitcast(
13453 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
13454 DAG.getTargetConstant(EltBits, DL, MVT::i8),
13455 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
13456 return DAG.getBitcast(VT,
13457 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
13458 }
13459
13460 // If this would require more than 2 unpack instructions to expand, use
13461 // pshufb when available. We can only use more than 2 unpack instructions
13462 // when zero extending i8 elements which also makes it easier to use pshufb.
13463 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
13464 assert(NumElements == 16 && "Unexpected byte vector width!");
13465 SDValue PSHUFBMask[16];
13466 for (int i = 0; i < 16; ++i) {
13467 int Idx = Offset + (i / Scale);
13468 if ((i % Scale == 0 && SafeOffset(Idx))) {
13469 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
13470 continue;
13471 }
13472 PSHUFBMask[i] =
13473 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
13474 }
13475 InputV = DAG.getBitcast(MVT::v16i8, InputV);
13476 return DAG.getBitcast(
13477 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
13478 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
13479 }
13480
13481 // If we are extending from an offset, ensure we start on a boundary that
13482 // we can unpack from.
13483 int AlignToUnpack = Offset % (NumElements / Scale);
13484 if (AlignToUnpack) {
13485 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
13486 for (int i = AlignToUnpack; i < NumElements; ++i)
13487 ShMask[i - AlignToUnpack] = i;
13488 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
13489 Offset -= AlignToUnpack;
13490 }
13491
13492 // Otherwise emit a sequence of unpacks.
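// Illustrative sketch (hypothetical values): zero-extending v16i8 by
// Scale = 4 with Offset == 0 and no SSE4.1 takes two rounds of the loop
// below: punpcklbw with a zero vector (i8 -> i16), then punpcklwd with zero
// (i16 -> i32); Scale halves and EltBits doubles each iteration until
// Scale == 1.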
13493 do {
13494 unsigned UnpackLoHi = X86ISD::UNPCKL;
13495 if (Offset >= (NumElements / 2)) {
13496 UnpackLoHi = X86ISD::UNPCKH;
13497 Offset -= (NumElements / 2);
13498 }
13499
13500 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
13501 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
13502 : getZeroVector(InputVT, Subtarget, DAG, DL);
13503 InputV = DAG.getBitcast(InputVT, InputV);
13504 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
13505 Scale /= 2;
13506 EltBits *= 2;
13507 NumElements /= 2;
13508 } while (Scale > 1);
13509 return DAG.getBitcast(VT, InputV);
13510}
13511
13512/// Try to lower a vector shuffle as a zero extension on any microarch.
13513///
13514/// This routine will try to do everything in its power to cleverly lower
13515/// a shuffle which happens to match the pattern of a zero extend. It doesn't
13516 /// check for the profitability of this lowering; it tries to aggressively
13517/// match this pattern. It will use all of the micro-architectural details it
13518/// can to emit an efficient lowering. It handles both blends with all-zero
13519/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
13520/// masking out later).
13521///
13522/// The reason we have dedicated lowering for zext-style shuffles is that they
13523/// are both incredibly common and often quite performance sensitive.
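/// As an illustration (hypothetical mask): a v8i16 shuffle with mask
/// { 0, Z, 1, Z, 2, Z, 3, Z }, where the Z positions are zeroable, matches
/// Scale = 2 against V1, i.e. a zero extension of the low four i16 elements
/// of V1 to i32.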
13524static SDValue lowerShuffleAsZeroOrAnyExtend(
13525 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13526 const APInt &Zeroable, const X86Subtarget &Subtarget,
13527 SelectionDAG &DAG) {
13528 int Bits = VT.getSizeInBits();
13529 int NumLanes = Bits / 128;
13530 int NumElements = VT.getVectorNumElements();
13531 int NumEltsPerLane = NumElements / NumLanes;
13532 assert(VT.getScalarSizeInBits() <= 32 &&
13533        "Exceeds 32-bit integer zero extension limit");
13534 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
13535
13536 // Define a helper function to check a particular ext-scale and lower to it if
13537 // valid.
13538 auto Lower = [&](int Scale) -> SDValue {
13539 SDValue InputV;
13540 bool AnyExt = true;
13541 int Offset = 0;
13542 int Matches = 0;
13543 for (int i = 0; i < NumElements; ++i) {
13544 int M = Mask[i];
13545 if (M < 0)
13546 continue; // Valid anywhere but doesn't tell us anything.
13547 if (i % Scale != 0) {
13548 // Each of the extended elements need to be zeroable.
13549 if (!Zeroable[i])
13550 return SDValue();
13551
13552 // We no longer are in the anyext case.
13553 AnyExt = false;
13554 continue;
13555 }
13556
13557 // Each of the base elements needs to be consecutive indices into the
13558 // same input vector.
13559 SDValue V = M < NumElements ? V1 : V2;
13560 M = M % NumElements;
13561 if (!InputV) {
13562 InputV = V;
13563 Offset = M - (i / Scale);
13564 } else if (InputV != V)
13565 return SDValue(); // Flip-flopping inputs.
13566
13567 // Offset must start in the lowest 128-bit lane or at the start of an
13568 // upper lane.
13569 // FIXME: Is it ever worth allowing a negative base offset?
13570 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
13571 (Offset % NumEltsPerLane) == 0))
13572 return SDValue();
13573
13574 // If we are offsetting, all referenced entries must come from the same
13575 // lane.
13576 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
13577 return SDValue();
13578
13579 if ((M % NumElements) != (Offset + (i / Scale)))
13580 return SDValue(); // Non-consecutive strided elements.
13581 Matches++;
13582 }
13583
13584 // If we fail to find an input, we have a zero-shuffle which should always
13585 // have already been handled.
13586 // FIXME: Maybe handle this here in case during blending we end up with one?
13587 if (!InputV)
13588 return SDValue();
13589
13590 // If we are offsetting, don't extend if we only match a single input, we
13591 // can always do better by using a basic PSHUF or PUNPCK.
13592 if (Offset != 0 && Matches < 2)
13593 return SDValue();
13594
13595 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
13596 InputV, Mask, Subtarget, DAG);
13597 };
13598
13599 // The widest scale possible for extending is to a 64-bit integer.
13600 assert(Bits % 64 == 0 &&
13601        "The number of bits in a vector must be divisible by 64 on x86!");
13602 int NumExtElements = Bits / 64;
13603
13604 // Each iteration, try extending the elements half as much, but into twice as
13605 // many elements.
13606 for (; NumExtElements < NumElements; NumExtElements *= 2) {
13607 assert(NumElements % NumExtElements == 0 &&
13608        "The input vector size must be divisible by the extended size.");
13609 if (SDValue V = Lower(NumElements / NumExtElements))
13610 return V;
13611 }
13612
13613 // General extends failed, but 128-bit vectors may be able to use MOVQ.
13614 if (Bits != 128)
13615 return SDValue();
13616
13617 // Returns one of the source operands if the shuffle can be reduced to a
13618 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
13619 auto CanZExtLowHalf = [&]() {
13620 for (int i = NumElements / 2; i != NumElements; ++i)
13621 if (!Zeroable[i])
13622 return SDValue();
13623 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
13624 return V1;
13625 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
13626 return V2;
13627 return SDValue();
13628 };
13629
13630 if (SDValue V = CanZExtLowHalf()) {
13631 V = DAG.getBitcast(MVT::v2i64, V);
13632 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
13633 return DAG.getBitcast(VT, V);
13634 }
13635
13636 // No viable ext lowering found.
13637 return SDValue();
13638}
13639
13640/// Try to get a scalar value for a specific element of a vector.
13641///
13642/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
13643static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
13644 SelectionDAG &DAG) {
13645 MVT VT = V.getSimpleValueType();
13646 MVT EltVT = VT.getVectorElementType();
13647 V = peekThroughBitcasts(V);
13648
13649 // If the bitcasts shift the element size, we can't extract an equivalent
13650 // element from it.
13651 MVT NewVT = V.getSimpleValueType();
13652 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
13653 return SDValue();
13654
13655 if (V.getOpcode() == ISD::BUILD_VECTOR ||
13656 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
13657 // Ensure the scalar operand is the same size as the destination.
13658 // FIXME: Add support for scalar truncation where possible.
13659 SDValue S = V.getOperand(Idx);
13660 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
13661 return DAG.getBitcast(EltVT, S);
13662 }
13663
13664 return SDValue();
13665}
13666
13667/// Helper to test for a load that can be folded with x86 shuffles.
13668///
13669/// This is particularly important because the set of instructions varies
13670/// significantly based on whether the operand is a load or not.
13671static bool isShuffleFoldableLoad(SDValue V) {
13672 V = peekThroughBitcasts(V);
13673 return ISD::isNON_EXTLoad(V.getNode());
13674}
13675
13676/// Try to lower insertion of a single element into a zero vector.
13677///
13678 /// This is a common pattern for which we have especially efficient lowering
13679 /// patterns across all subtarget feature sets.
13680static SDValue lowerShuffleAsElementInsertion(
13681 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13682 const APInt &Zeroable, const X86Subtarget &Subtarget,
13683 SelectionDAG &DAG) {
13684 MVT ExtVT = VT;
13685 MVT EltVT = VT.getVectorElementType();
13686
13687 int V2Index =
13688 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
13689 Mask.begin();
13690 bool IsV1Zeroable = true;
13691 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13692 if (i != V2Index && !Zeroable[i]) {
13693 IsV1Zeroable = false;
13694 break;
13695 }
13696
13697 // Check for a single input from a SCALAR_TO_VECTOR node.
13698 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
13699 // all the smarts here sunk into that routine. However, the current
13700 // lowering of BUILD_VECTOR makes that nearly impossible until the old
13701 // vector shuffle lowering is dead.
13702 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
13703 DAG);
13704 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
13705 // We need to zext the scalar if it is smaller than an i32.
13706 V2S = DAG.getBitcast(EltVT, V2S);
13707 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
13708 // Using zext to expand a narrow element won't work for non-zero
13709 // insertions.
13710 if (!IsV1Zeroable)
13711 return SDValue();
13712
13713 // Zero-extend directly to i32.
13714 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
13715 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
13716 }
13717 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
13718 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
13719 EltVT == MVT::i16) {
13720 // Either not inserting from the low element of the input or the input
13721 // element size is too small to use VZEXT_MOVL to clear the high bits.
13722 return SDValue();
13723 }
13724
13725 if (!IsV1Zeroable) {
13726 // If V1 can't be treated as a zero vector we have fewer options to lower
13727 // this. We can't support integer vectors or non-zero targets cheaply, and
13728 // the V1 elements can't be permuted in any way.
13729 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
13730 if (!VT.isFloatingPoint() || V2Index != 0)
13731 return SDValue();
13732 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
13733 V1Mask[V2Index] = -1;
13734 if (!isNoopShuffleMask(V1Mask))
13735 return SDValue();
13736 if (!VT.is128BitVector())
13737 return SDValue();
13738
13739 // Otherwise, use MOVSD, MOVSS or MOVSH.
13740 unsigned MovOpc = 0;
13741 if (EltVT == MVT::f16)
13742 MovOpc = X86ISD::MOVSH;
13743 else if (EltVT == MVT::f32)
13744 MovOpc = X86ISD::MOVSS;
13745 else if (EltVT == MVT::f64)
13746 MovOpc = X86ISD::MOVSD;
13747 else
13748 llvm_unreachable("Unsupported floating point element type to handle!");
13749 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
13750 }
13751
13752 // This lowering only works for the low element with floating point vectors.
13753 if (VT.isFloatingPoint() && V2Index != 0)
13754 return SDValue();
13755
13756 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
13757 if (ExtVT != VT)
13758 V2 = DAG.getBitcast(VT, V2);
13759
13760 if (V2Index != 0) {
13761 // If we have 4 or fewer lanes we can cheaply shuffle the element into
13762 // the desired position. Otherwise it is more efficient to do a vector
13763 // shift left. We know that we can do a vector shift left because all
13764 // the inputs are zero.
13765 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
13766 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
13767 V2Shuffle[V2Index] = 0;
13768 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
13769 } else {
13770 V2 = DAG.getBitcast(MVT::v16i8, V2);
13771 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
13772 DAG.getTargetConstant(
13773 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
13774 V2 = DAG.getBitcast(VT, V2);
13775 }
13776 }
13777 return V2;
13778}
13779
13780/// Try to lower broadcast of a single - truncated - integer element,
13781/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
13782///
13783/// This assumes we have AVX2.
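/// Illustrative example (hypothetical operands): for VT = v8i16 and
/// BroadcastIdx = 2, where \p V0 is a scalar_to_vector of a single i64,
/// Scale is 64 / 16 = 4 and V0BroadcastIdx is 0; OffsetIdx = 2 below shifts
/// the scalar right by 2 * 16 = 32 bits before it is truncated to i16 and
/// broadcast.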
13784static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
13785 int BroadcastIdx,
13786 const X86Subtarget &Subtarget,
13787 SelectionDAG &DAG) {
13788 assert(Subtarget.hasAVX2() &&
13789        "We can only lower integer broadcasts with AVX2!");
13790
13791 MVT EltVT = VT.getVectorElementType();
13792 MVT V0VT = V0.getSimpleValueType();
13793
13794 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
13795 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
13796
13797 MVT V0EltVT = V0VT.getVectorElementType();
13798 if (!V0EltVT.isInteger())
13799 return SDValue();
13800
13801 const unsigned EltSize = EltVT.getSizeInBits();
13802 const unsigned V0EltSize = V0EltVT.getSizeInBits();
13803
13804 // This is only a truncation if the original element type is larger.
13805 if (V0EltSize <= EltSize)
13806 return SDValue();
13807
13808 assert(((V0EltSize % EltSize) == 0) &&
13809        "Scalar type sizes must all be powers of 2 on x86!");
13810
13811 const unsigned V0Opc = V0.getOpcode();
13812 const unsigned Scale = V0EltSize / EltSize;
13813 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13814
13815 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
13816 V0Opc != ISD::BUILD_VECTOR)
13817 return SDValue();
13818
13819 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
13820
13821 // If we're extracting non-least-significant bits, shift so we can truncate.
13822 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
13823 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
13824 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
13825 if (const int OffsetIdx = BroadcastIdx % Scale)
13826 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
13827 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
13828
13829 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
13830 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
13831}
13832
13833/// Test whether this can be lowered with a single SHUFPS instruction.
13834///
13835/// This is used to disable more specialized lowerings when the shufps lowering
13836/// will happen to be efficient.
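/// For instance (illustrative masks): { 0, 1, 4, 5 } is a single-SHUFPS mask
/// (the low half reads only V1 and the high half only V2), while
/// { 0, 4, 1, 5 } is not, since its low half mixes both inputs.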
13837static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
13838 // This routine only handles 128-bit shufps.
13839 assert(Mask.size() == 4 && "Unsupported mask size!");
13840 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
13841 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
13842 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
13843 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
13844
13845 // To lower with a single SHUFPS we need to have the low half and high half
13846 // each requiring a single input.
13847 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
13848 return false;
13849 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
13850 return false;
13851
13852 return true;
13853}
13854
13855/// If we are extracting two 128-bit halves of a vector and shuffling the
13856/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
13857/// multi-shuffle lowering.
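/// Sketch of the transform (hypothetical operands): with X a v8f32 value,
///   shuf (extract X, 0), (extract X, 4), { 0, 7, 2, 5 }
/// becomes
///   extract (shuf X, undef, { 0, 7, 2, 5, -1, -1, -1, -1 }), 0
/// which a single VPERMPS can implement.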
13858static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
13859 SDValue N1, ArrayRef<int> Mask,
13860 SelectionDAG &DAG) {
13861 MVT VT = N0.getSimpleValueType();
13862 assert((VT.is128BitVector() &&
13863         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
13864        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
13865
13866 // Check that both sources are extracts of the same source vector.
13867 if (!N0.hasOneUse() || !N1.hasOneUse() ||
13868 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13869 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13870 N0.getOperand(0) != N1.getOperand(0))
13871 return SDValue();
13872
13873 SDValue WideVec = N0.getOperand(0);
13874 MVT WideVT = WideVec.getSimpleValueType();
13875 if (!WideVT.is256BitVector())
13876 return SDValue();
13877
13878 // Match extracts of each half of the wide source vector. Commute the shuffle
13879 // if the extract of the low half is N1.
13880 unsigned NumElts = VT.getVectorNumElements();
13881 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13882 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13883 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13884 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13885 ShuffleVectorSDNode::commuteMask(NewMask);
13886 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13887 return SDValue();
13888
13889 // Final bailout: if the mask is simple, we are better off using an extract
13890 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13891 // because that avoids a constant load from memory.
13892 if (NumElts == 4 &&
13893 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
13894 return SDValue();
13895
13896 // Extend the shuffle mask with undef elements.
13897 NewMask.append(NumElts, -1);
13898
13899 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13900 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13901 NewMask);
13902 // This is free: ymm -> xmm.
13903 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13904 DAG.getIntPtrConstant(0, DL));
13905}
13906
13907/// Try to lower broadcast of a single element.
13908///
13909/// For convenience, this code also bundles all of the subtarget feature set
13910/// filtering. While a little annoying to re-dispatch on type here, there isn't
13911/// a convenient way to factor it out.
13912static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13913 SDValue V2, ArrayRef<int> Mask,
13914 const X86Subtarget &Subtarget,
13915 SelectionDAG &DAG) {
13916 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13917 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
13918 (Subtarget.hasAVX2() && VT.isInteger())))
13919 return SDValue();
13920
13921 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13922 // we can only broadcast from a register with AVX2.
13923 unsigned NumEltBits = VT.getScalarSizeInBits();
13924 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13925 ? X86ISD::MOVDDUP
13926 : X86ISD::VBROADCAST;
13927 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13928
13929 // Check that the mask is a broadcast.
13930 int BroadcastIdx = getSplatIndex(Mask);
13931 if (BroadcastIdx < 0)
13932 return SDValue();
13933 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13934                                           "a sorted mask where the broadcast "
13935                                           "comes from V1.");
13936
13937 // Go up the chain of (vector) values to find a scalar load that we can
13938 // combine with the broadcast.
13939 // TODO: Combine this logic with findEltLoadSrc() used by
13940 // EltsFromConsecutiveLoads().
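// Illustrative walk (hypothetical node): broadcasting element 5 of a v8i32
// that is a concat_vectors of two v4i32 values starts with
// BitOffset = 5 * 32 = 160; the CONCAT_VECTORS case below selects operand
// 160 / 128 = 1 and reduces BitOffset to 160 % 128 = 32, i.e. element 1 of
// that operand.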
13941 int BitOffset = BroadcastIdx * NumEltBits;
13942 SDValue V = V1;
13943 for (;;) {
13944 switch (V.getOpcode()) {
13945 case ISD::BITCAST: {
13946 V = V.getOperand(0);
13947 continue;
13948 }
13949 case ISD::CONCAT_VECTORS: {
13950 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13951 int OpIdx = BitOffset / OpBitWidth;
13952 V = V.getOperand(OpIdx);
13953 BitOffset %= OpBitWidth;
13954 continue;
13955 }
13956 case ISD::EXTRACT_SUBVECTOR: {
13957 // The extraction index adds to the existing offset.
13958 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13959 unsigned Idx = V.getConstantOperandVal(1);
13960 unsigned BeginOffset = Idx * EltBitWidth;
13961 BitOffset += BeginOffset;
13962 V = V.getOperand(0);
13963 continue;
13964 }
13965 case ISD::INSERT_SUBVECTOR: {
13966 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13967 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13968 int Idx = (int)V.getConstantOperandVal(2);
13969 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13970 int BeginOffset = Idx * EltBitWidth;
13971 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13972 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13973 BitOffset -= BeginOffset;
13974 V = VInner;
13975 } else {
13976 V = VOuter;
13977 }
13978 continue;
13979 }
13980 }
13981 break;
13982 }
13983 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13984 BroadcastIdx = BitOffset / NumEltBits;
13985
13986 // Do we need to bitcast the source to retrieve the original broadcast index?
13987 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13988
13989 // Check if this is a broadcast of a scalar. We special case lowering
13990 // for scalars so that we can more effectively fold with loads.
13991 // If the original value has a larger element type than the shuffle, the
13992 // broadcast element is in essence truncated. Make that explicit to ease
13993 // folding.
13994 if (BitCastSrc && VT.isInteger())
13995 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13996 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13997 return TruncBroadcast;
13998
13999 // Also check the simpler case, where we can directly reuse the scalar.
14000 if (!BitCastSrc &&
14001 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14002 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14003 V = V.getOperand(BroadcastIdx);
14004
14005 // If we can't broadcast from a register, check that the input is a load.
14006 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14007 return SDValue();
14008 } else if (ISD::isNormalLoad(V.getNode()) &&
14009 cast<LoadSDNode>(V)->isSimple()) {
14010 // We do not check for one-use of the vector load because a broadcast load
14011 // is expected to be a win for code size, register pressure, and possibly
14012 // uops even if the original vector load is not eliminated.
14013
14014 // Reduce the vector load and shuffle to a broadcasted scalar load.
14015 LoadSDNode *Ld = cast<LoadSDNode>(V);
14016 SDValue BaseAddr = Ld->getOperand(1);
14017 MVT SVT = VT.getScalarType();
14018 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14019 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14020 SDValue NewAddr =
14021 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
14022
14023 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14024 // than MOVDDUP.
14025 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14026 if (Opcode == X86ISD::VBROADCAST) {
14027 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14028 SDValue Ops[] = {Ld->getChain(), NewAddr};
14029 V = DAG.getMemIntrinsicNode(
14030 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14031 DAG.getMachineFunction().getMachineMemOperand(
14032 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14033 DAG.makeEquivalentMemoryOrdering(Ld, V);
14034 return DAG.getBitcast(VT, V);
14035 }
14036 assert(SVT == MVT::f64 && "Unexpected VT!");
14037 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14038 DAG.getMachineFunction().getMachineMemOperand(
14039 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14040 DAG.makeEquivalentMemoryOrdering(Ld, V);
14041 } else if (!BroadcastFromReg) {
14042 // We can't broadcast from a vector register.
14043 return SDValue();
14044 } else if (BitOffset != 0) {
14045 // We can only broadcast from the zero-element of a vector register,
14046 // but it can be advantageous to broadcast from the zero-element of a
14047 // subvector.
14048 if (!VT.is256BitVector() && !VT.is512BitVector())
14049 return SDValue();
14050
14051 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14052 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14053 return SDValue();
14054
14055 // Only broadcast the zero-element of a 128-bit subvector.
14056 if ((BitOffset % 128) != 0)
14057 return SDValue();
14058
14059 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14060        "Unexpected bit-offset");
14061 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14062        "Unexpected vector size");
14063 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14064 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14065 }
14066
14067 // On AVX we can use VBROADCAST directly for scalar sources.
14068 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14069 V = DAG.getBitcast(MVT::f64, V);
14070 if (Subtarget.hasAVX()) {
14071 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14072 return DAG.getBitcast(VT, V);
14073 }
14074 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14075 }
14076
14077 // If this is a scalar, do the broadcast on this type and bitcast.
14078 if (!V.getValueType().isVector()) {
14079 assert(V.getScalarValueSizeInBits() == NumEltBits &&
14080        "Unexpected scalar size");
14081 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14082 VT.getVectorNumElements());
14083 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14084 }
14085
14086 // We only support broadcasting from 128-bit vectors to minimize the
14087 // number of patterns we need to deal with in isel. So extract down to
14088 // 128-bits, removing as many bitcasts as possible.
14089 if (V.getValueSizeInBits() > 128)
14090 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14091
14092 // Otherwise cast V to a vector with the same element type as VT, but
14093 // possibly narrower than VT. Then perform the broadcast.
14094 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14095 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14096 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14097}
14098
14099// Check for whether we can use INSERTPS to perform the shuffle. We only use
14100// INSERTPS when the V1 elements are already in the correct locations
14101// because otherwise we can just always use two SHUFPS instructions which
14102// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14103// perform INSERTPS if a single V1 element is out of place and all V2
14104// elements are zeroable.
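// Illustrative encoding (hypothetical mask): for a v4f32 shuffle with mask
// { 0, 6, 2, 3 } and nothing zeroable, the matcher below picks VBDstIndex = 1
// and VBSrcIndex = 2, so InsertPSMask = (2 << 6) | (1 << 4) | 0 = 0x90,
// i.e. insert element 2 of V2 into element 1 of V1 (source index in bits
// 7:6, destination index in bits 5:4, zero mask in bits 3:0).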
14105static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14106 unsigned &InsertPSMask,
14107 const APInt &Zeroable,
14108 ArrayRef<int> Mask, SelectionDAG &DAG) {
14109 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14110 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14111 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14112
14113 // Attempt to match INSERTPS with one element from VA or VB being
14114 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14115 // are updated.
14116 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
14117 ArrayRef<int> CandidateMask) {
14118 unsigned ZMask = 0;
14119 int VADstIndex = -1;
14120 int VBDstIndex = -1;
14121 bool VAUsedInPlace = false;
14122
14123 for (int i = 0; i < 4; ++i) {
14124 // Synthesize a zero mask from the zeroable elements (includes undefs).
14125 if (Zeroable[i]) {
14126 ZMask |= 1 << i;
14127 continue;
14128 }
14129
14130 // Flag if we use any VA inputs in place.
14131 if (i == CandidateMask[i]) {
14132 VAUsedInPlace = true;
14133 continue;
14134 }
14135
14136 // We can only insert a single non-zeroable element.
14137 if (VADstIndex >= 0 || VBDstIndex >= 0)
14138 return false;
14139
14140 if (CandidateMask[i] < 4) {
14141 // VA input out of place for insertion.
14142 VADstIndex = i;
14143 } else {
14144 // VB input for insertion.
14145 VBDstIndex = i;
14146 }
14147 }
14148
14149 // Don't bother if we have no (non-zeroable) element for insertion.
14150 if (VADstIndex < 0 && VBDstIndex < 0)
14151 return false;
14152
14153 // Determine element insertion src/dst indices. The src index is from the
14154 // start of the inserted vector, not the start of the concatenated vector.
14155 unsigned VBSrcIndex = 0;
14156 if (VADstIndex >= 0) {
14157 // If we have a VA input out of place, we use VA as the V2 element
14158 // insertion and don't use the original V2 at all.
14159 VBSrcIndex = CandidateMask[VADstIndex];
14160 VBDstIndex = VADstIndex;
14161 VB = VA;
14162 } else {
14163 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
14164 }
14165
14166 // If no V1 inputs are used in place, then the result is created only from
14167 // the zero mask and the V2 insertion - so remove V1 dependency.
14168 if (!VAUsedInPlace)
14169 VA = DAG.getUNDEF(MVT::v4f32);
14170
14171 // Update V1, V2 and InsertPSMask accordingly.
14172 V1 = VA;
14173 V2 = VB;
14174
14175 // Insert the V2 element into the desired position.
14176 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
14177 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
14178 return true;
14179 };
14180
14181 if (matchAsInsertPS(V1, V2, Mask))
14182 return true;
14183
14184 // Commute and try again.
14185 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
14186 ShuffleVectorSDNode::commuteMask(CommutedMask);
14187 if (matchAsInsertPS(V2, V1, CommutedMask))
14188 return true;
14189
14190 return false;
14191}
14192
14193static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
14194 ArrayRef<int> Mask, const APInt &Zeroable,
14195 SelectionDAG &DAG) {
14196 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14197 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14198
14199 // Attempt to match the insertps pattern.
14200 unsigned InsertPSMask = 0;
14201 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
14202 return SDValue();
14203
14204 // Insert the V2 element into the desired position.
14205 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
14206 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
14207}
14208
14209/// Try to lower a shuffle as a permute of the inputs followed by an
14210/// UNPCK instruction.
14211///
14212/// This specifically targets cases where we end up with alternating between
14213/// the two inputs, and so can permute them into something that feeds a single
14214/// UNPCK instruction. Note that this routine only targets integer vectors
14215/// because for floating point vectors we have a generalized SHUFPS lowering
14216/// strategy that handles everything that doesn't *exactly* match an unpack,
14217/// making this clever lowering unnecessary.
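/// A small example (hypothetical mask): a v4i32 shuffle with mask
/// { 0, 6, 1, 7 } is handled by permuting V2 with { 2, 3, -1, -1 } (V1 needs
/// no permute) and then emitting a single UNPCKL of V1 and the permuted V2.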
14218static SDValue lowerShuffleAsPermuteAndUnpack(
14219 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14220 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14221 assert(!VT.isFloatingPoint() &&
14222        "This routine only supports integer vectors.");
14223 assert(VT.is128BitVector() &&
14224        "This routine only works on 128-bit vectors.");
14225 assert(!V2.isUndef() &&
14226        "This routine should only be used when blending two inputs.");
14227 assert(Mask.size() >= 2 && "Single element masks are invalid.");
14228
14229 int Size = Mask.size();
14230
14231 int NumLoInputs =
14232 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
14233 int NumHiInputs =
14234 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
14235
14236 bool UnpackLo = NumLoInputs >= NumHiInputs;
14237
14238 auto TryUnpack = [&](int ScalarSize, int Scale) {
14239 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
14240 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
14241
14242 for (int i = 0; i < Size; ++i) {
14243 if (Mask[i] < 0)
14244 continue;
14245
14246 // Each element of the unpack contains Scale elements from this mask.
14247 int UnpackIdx = i / Scale;
14248
14249 // We only handle the case where V1 feeds the first slots of the unpack.
14250 // We rely on canonicalization to ensure this is the case.
14251 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
14252 return SDValue();
14253
14254 // Setup the mask for this input. The indexing is tricky as we have to
14255 // handle the unpack stride.
14256 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
14257 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
14258 Mask[i] % Size;
14259 }
14260
14261 // If we will have to shuffle both inputs to use the unpack, check whether
14262 // we can just unpack first and shuffle the result. If so, skip this unpack.
14263 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
14264 !isNoopShuffleMask(V2Mask))
14265 return SDValue();
14266
14267 // Shuffle the inputs into place.
14268 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
14269 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
14270
14271 // Cast the inputs to the type we will use to unpack them.
14272 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
14273 V1 = DAG.getBitcast(UnpackVT, V1);
14274 V2 = DAG.getBitcast(UnpackVT, V2);
14275
14276 // Unpack the inputs and cast the result back to the desired type.
14277 return DAG.getBitcast(
14278 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14279 UnpackVT, V1, V2));
14280 };
14281
14282 // We try each unpack from the largest to the smallest to try and find one
14283 // that fits this mask.
14284 int OrigScalarSize = VT.getScalarSizeInBits();
14285 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
14286 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
14287 return Unpack;
14288
14289 // If we're shuffling with a zero vector then we're better off not doing
14290 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
14291 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
14292 ISD::isBuildVectorAllZeros(V2.getNode()))
14293 return SDValue();
14294
14295 // If none of the unpack-rooted lowerings worked (or were profitable) try an
14296 // initial unpack.
14297 if (NumLoInputs == 0 || NumHiInputs == 0) {
14298     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
14299            "We have to have *some* inputs!");
14300 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
14301
14302 // FIXME: We could consider the total complexity of the permute of each
14303 // possible unpacking. Or at the least we should consider how many
14304 // half-crossings are created.
14305 // FIXME: We could consider commuting the unpacks.
14306
14307 SmallVector<int, 32> PermMask((unsigned)Size, -1);
14308 for (int i = 0; i < Size; ++i) {
14309 if (Mask[i] < 0)
14310 continue;
14311
14312       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
14313
14314 PermMask[i] =
14315 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
14316 }
14317 return DAG.getVectorShuffle(
14318 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
14319 DL, VT, V1, V2),
14320 DAG.getUNDEF(VT), PermMask);
14321 }
14322
14323 return SDValue();
14324}
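// Illustrative example (not from the original file): for v4i32, UNPCKL
// interleaves the low halves, producing <V1[0], V2[0], V1[1], V2[1]>. A mask
// such as <0, 5, 1, 4> does not match that directly, but after permuting V2 to
// swap its two low elements (V2' = <V2[1], V2[0], ...>) the same UNPCKL yields
// <V1[0], V2[1], V1[1], V2[0]>, which is exactly the requested shuffle -- the
// permute-then-unpack pattern the routine above targets.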
14325
14326/// Handle lowering of 2-lane 64-bit floating point shuffles.
14327///
14328/// This is the basis function for the 2-lane 64-bit shuffles as we have full
14329/// support for floating point shuffles but not integer shuffles. These
14330/// instructions will incur a domain crossing penalty on some chips though so
14331/// it is better to avoid lowering through this for integer vectors where
14332/// possible.
14333static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14334 const APInt &Zeroable, SDValue V1, SDValue V2,
14335 const X86Subtarget &Subtarget,
14336 SelectionDAG &DAG) {
14337   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14338   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
14339   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14340
14341 if (V2.isUndef()) {
14342 // Check for being able to broadcast a single element.
14343 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
14344 Mask, Subtarget, DAG))
14345 return Broadcast;
14346
14347 // Straight shuffle of a single input vector. Simulate this by using the
14348     // single input as both of the "inputs" to this instruction.
14349 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
14350
14351 if (Subtarget.hasAVX()) {
14352 // If we have AVX, we can use VPERMILPS which will allow folding a load
14353 // into the shuffle.
14354 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
14355 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14356 }
14357
14358 return DAG.getNode(
14359 X86ISD::SHUFP, DL, MVT::v2f64,
14360 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14361 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
14362 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14363 }
14364   assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14365   assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
14366   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14367   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14368
14369 if (Subtarget.hasAVX2())
14370 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14371 return Extract;
14372
14373 // When loading a scalar and then shuffling it into a vector we can often do
14374 // the insertion cheaply.
14375 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14376 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14377 return Insertion;
14378 // Try inverting the insertion since for v2 masks it is easy to do and we
14379 // can't reliably sort the mask one way or the other.
14380 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
14381 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
14382 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14383 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14384 return Insertion;
14385
14386 // Try to use one of the special instruction patterns to handle two common
14387 // blend patterns if a zero-blend above didn't work.
14388 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
14389 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
14390 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
14391 // We can either use a special instruction to load over the low double or
14392 // to move just the low double.
14393 return DAG.getNode(
14394 X86ISD::MOVSD, DL, MVT::v2f64, V2,
14395 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
14396
14397 if (Subtarget.hasSSE41())
14398 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
14399 Zeroable, Subtarget, DAG))
14400 return Blend;
14401
14402 // Use dedicated unpack instructions for masks that match their pattern.
14403 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
14404 return V;
14405
14406 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
14407 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
14408 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
14409}
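// Illustrative sketch (not from the original file): the SHUFPD immediate built
// above uses bit 0 to select the element read from the first operand and bit 1
// to select the element read from the second operand. For the two-input mask
// <1, 2>, bit 0 is set (take V1[1]) and bit 1 is clear (take V2[0]), giving
// 0b01. A hypothetical stand-alone encoder under that assumption:
//
//   static unsigned buildSHUFPDImm(unsigned Lane0FromOp0, unsigned Lane1FromOp1) {
//     // Each argument is the element index (0 or 1) to read from its operand.
//     return (Lane0FromOp0 & 1) | ((Lane1FromOp1 & 1) << 1);
//   }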
14410
14411/// Handle lowering of 2-lane 64-bit integer shuffles.
14412///
14413/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
14414/// the integer unit to minimize domain crossing penalties. However, for blends
14415/// it falls back to the floating point shuffle operation with appropriate bit
14416/// casting.
14417static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14418 const APInt &Zeroable, SDValue V1, SDValue V2,
14419 const X86Subtarget &Subtarget,
14420 SelectionDAG &DAG) {
14421   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14422   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
14423   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
14424
14425 if (V2.isUndef()) {
14426 // Check for being able to broadcast a single element.
14427 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
14428 Mask, Subtarget, DAG))
14429 return Broadcast;
14430
14431 // Straight shuffle of a single input vector. For everything from SSE2
14432 // onward this has a single fast instruction with no scary immediates.
14433 // We have to map the mask as it is actually a v4i32 shuffle instruction.
14434 V1 = DAG.getBitcast(MVT::v4i32, V1);
14435 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
14436 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
14437 Mask[1] < 0 ? -1 : (Mask[1] * 2),
14438 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
14439 return DAG.getBitcast(
14440 MVT::v2i64,
14441 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14442 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
14443 }
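// Illustrative example (not from the original file): in the unary path above a
// v2i64 mask is widened to a v4i32 PSHUFD mask by doubling each index and
// appending its pair. The mask <1, 0> (swap the two 64-bit halves) becomes the
// v4i32 mask <2, 3, 0, 1>, which a single PSHUFD with immediate 0x4E
// (0b01001110) implements.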
14444   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
14445   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
14446   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
14447   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
14448
14449 if (Subtarget.hasAVX2())
14450 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14451 return Extract;
14452
14453 // Try to use shift instructions.
14454 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
14455 Zeroable, Subtarget, DAG))
14456 return Shift;
14457
14458 // When loading a scalar and then shuffling it into a vector we can often do
14459 // the insertion cheaply.
14460 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14461 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
14462 return Insertion;
14463 // Try inverting the insertion since for v2 masks it is easy to do and we
14464 // can't reliably sort the mask one way or the other.
14465 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
14466 if (SDValue Insertion = lowerShuffleAsElementInsertion(
14467 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
14468 return Insertion;
14469
14470 // We have different paths for blend lowering, but they all must use the
14471 // *exact* same predicate.
14472 bool IsBlendSupported = Subtarget.hasSSE41();
14473 if (IsBlendSupported)
14474 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
14475 Zeroable, Subtarget, DAG))
14476 return Blend;
14477
14478 // Use dedicated unpack instructions for masks that match their pattern.
14479 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
14480 return V;
14481
14482 // Try to use byte rotation instructions.
14483   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14484 if (Subtarget.hasSSSE3()) {
14485 if (Subtarget.hasVLX())
14486 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
14487 Subtarget, DAG))
14488 return Rotate;
14489
14490 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
14491 Subtarget, DAG))
14492 return Rotate;
14493 }
14494
14495 // If we have direct support for blends, we should lower by decomposing into
14496 // a permute. That will be faster than the domain cross.
14497 if (IsBlendSupported)
14498 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
14499 Subtarget, DAG);
14500
14501 // We implement this with SHUFPD which is pretty lame because it will likely
14502 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
14503 // However, all the alternatives are still more cycles and newer chips don't
14504 // have this problem. It would be really nice if x86 had better shuffles here.
14505 V1 = DAG.getBitcast(MVT::v2f64, V1);
14506 V2 = DAG.getBitcast(MVT::v2f64, V2);
14507 return DAG.getBitcast(MVT::v2i64,
14508 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
14509}
14510
14511/// Lower a vector shuffle using the SHUFPS instruction.
14512///
14513/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
14514/// It makes no assumptions about whether this is the *best* lowering, it simply
14515/// uses it.
14516static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
14517 ArrayRef<int> Mask, SDValue V1,
14518 SDValue V2, SelectionDAG &DAG) {
14519 SDValue LowV = V1, HighV = V2;
14520 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
14521 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14522
14523 if (NumV2Elements == 1) {
14524 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
14525
14526 // Compute the index adjacent to V2Index and in the same half by toggling
14527 // the low bit.
14528 int V2AdjIndex = V2Index ^ 1;
14529
14530 if (Mask[V2AdjIndex] < 0) {
14531 // Handles all the cases where we have a single V2 element and an undef.
14532 // This will only ever happen in the high lanes because we commute the
14533 // vector otherwise.
14534 if (V2Index < 2)
14535 std::swap(LowV, HighV);
14536 NewMask[V2Index] -= 4;
14537 } else {
14538 // Handle the case where the V2 element ends up adjacent to a V1 element.
14539 // To make this work, blend them together as the first step.
14540 int V1Index = V2AdjIndex;
14541 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
14542 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
14543 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14544
14545 // Now proceed to reconstruct the final blend as we have the necessary
14546 // high or low half formed.
14547 if (V2Index < 2) {
14548 LowV = V2;
14549 HighV = V1;
14550 } else {
14551 HighV = V2;
14552 }
14553 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
14554 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
14555 }
14556 } else if (NumV2Elements == 2) {
14557 if (Mask[0] < 4 && Mask[1] < 4) {
14558 // Handle the easy case where we have V1 in the low lanes and V2 in the
14559 // high lanes.
14560 NewMask[2] -= 4;
14561 NewMask[3] -= 4;
14562 } else if (Mask[2] < 4 && Mask[3] < 4) {
14563 // We also handle the reversed case because this utility may get called
14564 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
14565 // arrange things in the right direction.
14566 NewMask[0] -= 4;
14567 NewMask[1] -= 4;
14568 HighV = V1;
14569 LowV = V2;
14570 } else {
14571 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
14572 // trying to place elements directly, just blend them and set up the final
14573 // shuffle to place them.
14574
14575 // The first two blend mask elements are for V1, the second two are for
14576 // V2.
14577 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
14578 Mask[2] < 4 ? Mask[2] : Mask[3],
14579 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
14580 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
14581 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
14582 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
14583
14584 // Now we do a normal shuffle of V1 by giving V1 as both operands to
14585 // a blend.
14586 LowV = HighV = V1;
14587 NewMask[0] = Mask[0] < 4 ? 0 : 2;
14588 NewMask[1] = Mask[0] < 4 ? 2 : 0;
14589 NewMask[2] = Mask[2] < 4 ? 1 : 3;
14590 NewMask[3] = Mask[2] < 4 ? 3 : 1;
14591 }
14592 } else if (NumV2Elements == 3) {
14593 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
14594     // we can get here due to other paths (e.g. repeated mask matching) that we
14595 // don't want to do another round of lowerVECTOR_SHUFFLE.
14596 ShuffleVectorSDNode::commuteMask(NewMask);
14597 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
14598 }
14599 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
14600 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
14601}
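// Illustrative example (not from the original file): SHUFPS takes its two low
// result lanes from the first operand and its two high result lanes from the
// second, each selected by a 2-bit field of the immediate. In the easy
// NumV2Elements == 2 case above, the mask <1, 3, 5, 6> becomes
// NewMask = <1, 3, 1, 2> with LowV = V1 and HighV = V2, so a single SHUFPS
// produces <V1[1], V1[3], V2[1], V2[2]> -- exactly the requested shuffle.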
14602
14603/// Lower 4-lane 32-bit floating point shuffles.
14604///
14605/// Uses instructions exclusively from the floating point unit to minimize
14606/// domain crossing penalties, as these are sufficient to implement all v4f32
14607/// shuffles.
14608static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14609 const APInt &Zeroable, SDValue V1, SDValue V2,
14610 const X86Subtarget &Subtarget,
14611 SelectionDAG &DAG) {
14612   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14613   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
14614   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14615
14616 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14617
14618 if (NumV2Elements == 0) {
14619 // Check for being able to broadcast a single element.
14620 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
14621 Mask, Subtarget, DAG))
14622 return Broadcast;
14623
14624 // Use even/odd duplicate instructions for masks that match their pattern.
14625 if (Subtarget.hasSSE3()) {
14626 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
14627 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
14628 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
14629 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
14630 }
14631
14632 if (Subtarget.hasAVX()) {
14633 // If we have AVX, we can use VPERMILPS which will allow folding a load
14634 // into the shuffle.
14635 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
14636 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14637 }
14638
14639 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
14640 // in SSE1 because otherwise they are widened to v2f64 and never get here.
14641 if (!Subtarget.hasSSE2()) {
14642 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
14643 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
14644 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
14645 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
14646 }
14647
14648 // Otherwise, use a straight shuffle of a single input vector. We pass the
14649 // input vector to both operands to simulate this with a SHUFPS.
14650 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
14651 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14652 }
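// Illustrative example (not from the original file): the SSE3 fast paths above
// rely on MOVSLDUP duplicating the even lanes (<V1[0], V1[0], V1[2], V1[2]>)
// and MOVSHDUP duplicating the odd lanes (<V1[1], V1[1], V1[3], V1[3]>), which
// is why the masks <0, 0, 2, 2> and <1, 1, 3, 3> are matched before falling
// back to the more general VPERMILPS/SHUFPS lowerings.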
14653
14654 if (Subtarget.hasAVX2())
14655 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14656 return Extract;
14657
14658 // There are special ways we can lower some single-element blends. However, we
14659 // have custom ways we can lower more complex single-element blends below that
14660 // we defer to if both this and BLENDPS fail to match, so restrict this to
14661 // when the V2 input is targeting element 0 of the mask -- that is the fast
14662 // case here.
14663 if (NumV2Elements == 1 && Mask[0] >= 4)
14664 if (SDValue V = lowerShuffleAsElementInsertion(
14665 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14666 return V;
14667
14668 if (Subtarget.hasSSE41()) {
14669 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
14670 Zeroable, Subtarget, DAG))
14671 return Blend;
14672
14673 // Use INSERTPS if we can complete the shuffle efficiently.
14674 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
14675 return V;
14676
14677 if (!isSingleSHUFPSMask(Mask))
14678 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
14679 V2, Mask, DAG))
14680 return BlendPerm;
14681 }
14682
14683 // Use low/high mov instructions. These are only valid in SSE1 because
14684 // otherwise they are widened to v2f64 and never get here.
14685 if (!Subtarget.hasSSE2()) {
14686 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
14687 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
14688 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
14689 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
14690 }
14691
14692 // Use dedicated unpack instructions for masks that match their pattern.
14693 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
14694 return V;
14695
14696 // Otherwise fall back to a SHUFPS lowering strategy.
14697 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
14698}
14699
14700/// Lower 4-lane i32 vector shuffles.
14701///
14702/// We try to handle these with integer-domain shuffles where we can, but for
14703/// blends we use the floating point domain blend instructions.
14704static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14705 const APInt &Zeroable, SDValue V1, SDValue V2,
14706 const X86Subtarget &Subtarget,
14707 SelectionDAG &DAG) {
14708   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14709   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
14710   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14711
14712 // Whenever we can lower this as a zext, that instruction is strictly faster
14713 // than any alternative. It also allows us to fold memory operands into the
14714 // shuffle in many cases.
14715 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
14716 Zeroable, Subtarget, DAG))
14717 return ZExt;
14718
14719 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
14720
14721 if (NumV2Elements == 0) {
14722 // Try to use broadcast unless the mask only has one non-undef element.
14723 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
14724 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
14725 Mask, Subtarget, DAG))
14726 return Broadcast;
14727 }
14728
14729 // Straight shuffle of a single input vector. For everything from SSE2
14730 // onward this has a single fast instruction with no scary immediates.
14731 // We coerce the shuffle pattern to be compatible with UNPCK instructions
14732 // but we aren't actually going to use the UNPCK instruction because doing
14733 // so prevents folding a load into this instruction or making a copy.
14734 const int UnpackLoMask[] = {0, 0, 1, 1};
14735 const int UnpackHiMask[] = {2, 2, 3, 3};
14736 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
14737 Mask = UnpackLoMask;
14738 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
14739 Mask = UnpackHiMask;
14740
14741 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
14742 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
14743 }
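// Illustrative sketch (not from the original file): the PSHUFD path above
// encodes a 4-element mask into an 8-bit immediate with two bits per result
// lane (lane i occupies bits [2*i+1 : 2*i]). Assuming that layout, a
// hypothetical encoder applied to the coerced UnpackLoMask {0, 0, 1, 1} gives
// 0x50:
//
//   static unsigned encodeShuffle4Imm(const int M[4]) {
//     unsigned Imm = 0;
//     for (int i = 0; i != 4; ++i)
//       Imm |= (M[i] & 0x3) << (2 * i); // {0, 0, 1, 1} -> 0x50
//     return Imm;
//   }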
14744
14745 if (Subtarget.hasAVX2())
14746 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
14747 return Extract;
14748
14749 // Try to use shift instructions.
14750 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
14751 Zeroable, Subtarget, DAG))
14752 return Shift;
14753
14754 // There are special ways we can lower some single-element blends.
14755 if (NumV2Elements == 1)
14756 if (SDValue V = lowerShuffleAsElementInsertion(
14757 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14758 return V;
14759
14760 // We have different paths for blend lowering, but they all must use the
14761 // *exact* same predicate.
14762 bool IsBlendSupported = Subtarget.hasSSE41();
14763 if (IsBlendSupported)
14764 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
14765 Zeroable, Subtarget, DAG))
14766 return Blend;
14767
14768 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
14769 Zeroable, Subtarget, DAG))
14770 return Masked;
14771
14772 // Use dedicated unpack instructions for masks that match their pattern.
14773 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
14774 return V;
14775
14776 // Try to use byte rotation instructions.
14777   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
14778 if (Subtarget.hasSSSE3()) {
14779 if (Subtarget.hasVLX())
14780 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
14781 Subtarget, DAG))
14782 return Rotate;
14783
14784 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
14785 Subtarget, DAG))
14786 return Rotate;
14787 }
14788
14789 // Assume that a single SHUFPS is faster than an alternative sequence of
14790 // multiple instructions (even if the CPU has a domain penalty).
14791 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14792 if (!isSingleSHUFPSMask(Mask)) {
14793 // If we have direct support for blends, we should lower by decomposing into
14794 // a permute. That will be faster than the domain cross.
14795 if (IsBlendSupported)
14796 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
14797 Subtarget, DAG);
14798
14799 // Try to lower by permuting the inputs into an unpack instruction.
14800 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
14801 Mask, Subtarget, DAG))
14802 return Unpack;
14803 }
14804
14805 // We implement this with SHUFPS because it can blend from two vectors.
14806 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
14807 // up the inputs, bypassing domain shift penalties that we would incur if we
14808 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
14809 // relevant.
14810 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
14811 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
14812 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
14813 return DAG.getBitcast(MVT::v4i32, ShufPS);
14814}
14815
14816/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
14817/// shuffle lowering, and the most complex part.
14818///
14819/// The lowering strategy is to try to form pairs of input lanes which are
14820/// targeted at the same half of the final vector, and then use a dword shuffle
14821/// to place them onto the right half, and finally unpack the paired lanes into
14822/// their final position.
14823///
14824/// The exact breakdown of how to form these dword pairs and align them on the
14825/// correct sides is really tricky. See the comments within the function for
14826/// more of the details.
14827///
14828/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
14829/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
14830/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
14831/// vector, form the analogous 128-bit 8-element Mask.
14832static SDValue lowerV8I16GeneralSingleInputShuffle(
14833 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
14834 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14835   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
14836 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
14837
14838   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
14839 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
14840 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
14841
14842 // Attempt to directly match PSHUFLW or PSHUFHW.
14843 if (isUndefOrInRange(LoMask, 0, 4) &&
14844 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
14845 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14846 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14847 }
14848 if (isUndefOrInRange(HiMask, 4, 8) &&
14849 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
14850 for (int i = 0; i != 4; ++i)
14851 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
14852 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14853 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14854 }
14855
14856 SmallVector<int, 4> LoInputs;
14857 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
14858 array_pod_sort(LoInputs.begin(), LoInputs.end());
14859 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
14860 SmallVector<int, 4> HiInputs;
14861 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
14862 array_pod_sort(HiInputs.begin(), HiInputs.end());
14863 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
14864 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
14865 int NumHToL = LoInputs.size() - NumLToL;
14866 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
14867 int NumHToH = HiInputs.size() - NumLToH;
14868 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
14869 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
14870 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
14871 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
14872
14873 // If we are shuffling values from one half - check how many different DWORD
14874 // pairs we need to create. If only 1 or 2 then we can perform this as a
14875 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
14876 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
14877 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
14878 V = DAG.getNode(ShufWOp, DL, VT, V,
14879 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14880 V = DAG.getBitcast(PSHUFDVT, V);
14881 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
14882 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14883 return DAG.getBitcast(VT, V);
14884 };
14885
14886 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
14887 int PSHUFDMask[4] = { -1, -1, -1, -1 };
14888 SmallVector<std::pair<int, int>, 4> DWordPairs;
14889 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
14890
14891 // Collect the different DWORD pairs.
14892 for (int DWord = 0; DWord != 4; ++DWord) {
14893 int M0 = Mask[2 * DWord + 0];
14894 int M1 = Mask[2 * DWord + 1];
14895 M0 = (M0 >= 0 ? M0 % 4 : M0);
14896 M1 = (M1 >= 0 ? M1 % 4 : M1);
14897 if (M0 < 0 && M1 < 0)
14898 continue;
14899
14900 bool Match = false;
14901 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
14902 auto &DWordPair = DWordPairs[j];
14903 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
14904 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
14905 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
14906 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
14907 PSHUFDMask[DWord] = DOffset + j;
14908 Match = true;
14909 break;
14910 }
14911 }
14912 if (!Match) {
14913 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
14914 DWordPairs.push_back(std::make_pair(M0, M1));
14915 }
14916 }
14917
14918 if (DWordPairs.size() <= 2) {
14919 DWordPairs.resize(2, std::make_pair(-1, -1));
14920 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
14921 DWordPairs[1].first, DWordPairs[1].second};
14922 if ((NumHToL + NumHToH) == 0)
14923 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
14924 if ((NumLToL + NumLToH) == 0)
14925 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
14926 }
14927 }
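// Illustrative example (not from the original file): for a single-input mask
// such as <0, 1, 2, 3, 0, 1, 2, 3> every word comes from the low half and only
// two distinct word pairs occur, (0,1) and (2,3). The block above then emits a
// (no-op) PSHUFLW followed by a PSHUFD with dword mask <0, 1, 0, 1>, i.e. it
// duplicates the low 64 bits rather than using the longer
// PSHUFD+PSHUFLW+PSHUFHW chain of the generic code below.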
14928
14929 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
14930 // such inputs we can swap two of the dwords across the half mark and end up
14931 // with <=2 inputs to each half in each half. Once there, we can fall through
14932 // to the generic code below. For example:
14933 //
14934 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14935 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14936 //
14937 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14938 // and an existing 2-into-2 on the other half. In this case we may have to
14939 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14940 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14941 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14942 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14943 // half than the one we target for fixing) will be fixed when we re-enter this
14944 // path. We will also combine away any sequence of PSHUFD instructions that
14945 // result into a single instruction. Here is an example of the tricky case:
14946 //
14947 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14948 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14949 //
14950 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14951 //
14952 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14953 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14954 //
14955 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14956 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14957 //
14958 // The result is fine to be handled by the generic logic.
14959 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14960 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14961 int AOffset, int BOffset) {
14962     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14963            "Must call this with A having 3 or 1 inputs from the A half.");
14964     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14965            "Must call this with B having 1 or 3 inputs from the B half.");
14966     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14967            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14968
14969 bool ThreeAInputs = AToAInputs.size() == 3;
14970
14971 // Compute the index of dword with only one word among the three inputs in
14972 // a half by taking the sum of the half with three inputs and subtracting
14973 // the sum of the actual three inputs. The difference is the remaining
14974 // slot.
14975 int ADWord = 0, BDWord = 0;
14976 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14977 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14978 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14979 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14980 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14981 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14982 int TripleNonInputIdx =
14983 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14984 TripleDWord = TripleNonInputIdx / 2;
14985
14986 // We use xor with one to compute the adjacent DWord to whichever one the
14987 // OneInput is in.
14988 OneInputDWord = (OneInput / 2) ^ 1;
14989
14990 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14991 // and BToA inputs. If there is also such a problem with the BToB and AToB
14992 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14993 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14994 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14995 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14996       // Compute how many inputs will be flipped by swapping these DWords. We
14997       // need to balance this to ensure we don't form a 3-1 shuffle in the
14998       // other half.
14999
15000 int NumFlippedAToBInputs =
15001 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
15002 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
15003 int NumFlippedBToBInputs =
15004 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
15005 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
15006 if ((NumFlippedAToBInputs == 1 &&
15007 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15008 (NumFlippedBToBInputs == 1 &&
15009 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15010 // We choose whether to fix the A half or B half based on whether that
15011 // half has zero flipped inputs. At zero, we may not be able to fix it
15012 // with that half. We also bias towards fixing the B half because that
15013 // will more commonly be the high half, and we have to bias one way.
15014 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15015 ArrayRef<int> Inputs) {
15016 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15017 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15018 // Determine whether the free index is in the flipped dword or the
15019 // unflipped dword based on where the pinned index is. We use this bit
15020 // in an xor to conditionally select the adjacent dword.
15021 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15022 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15023 if (IsFixIdxInput == IsFixFreeIdxInput)
15024 FixFreeIdx += 1;
15025 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15026           assert(IsFixIdxInput != IsFixFreeIdxInput &&
15027                  "We need to be changing the number of flipped inputs!");
15028 int PSHUFHalfMask[] = {0, 1, 2, 3};
15029 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15030 V = DAG.getNode(
15031 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15032 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15033 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15034
15035 for (int &M : Mask)
15036 if (M >= 0 && M == FixIdx)
15037 M = FixFreeIdx;
15038 else if (M >= 0 && M == FixFreeIdx)
15039 M = FixIdx;
15040 };
15041 if (NumFlippedBToBInputs != 0) {
15042 int BPinnedIdx =
15043 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15044 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15045 } else {
15046           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15047 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15048 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15049 }
15050 }
15051 }
15052
15053 int PSHUFDMask[] = {0, 1, 2, 3};
15054 PSHUFDMask[ADWord] = BDWord;
15055 PSHUFDMask[BDWord] = ADWord;
15056 V = DAG.getBitcast(
15057 VT,
15058 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15059 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15060
15061 // Adjust the mask to match the new locations of A and B.
15062 for (int &M : Mask)
15063 if (M >= 0 && M/2 == ADWord)
15064 M = 2 * BDWord + M % 2;
15065 else if (M >= 0 && M/2 == BDWord)
15066 M = 2 * ADWord + M % 2;
15067
15068 // Recurse back into this routine to re-compute state now that this isn't
15069 // a 3 and 1 problem.
15070 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15071 };
15072 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15073 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15074 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15075 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15076
15077 // At this point there are at most two inputs to the low and high halves from
15078 // each half. That means the inputs can always be grouped into dwords and
15079 // those dwords can then be moved to the correct half with a dword shuffle.
15080 // We use at most one low and one high word shuffle to collect these paired
15081 // inputs into dwords, and finally a dword shuffle to place them.
15082 int PSHUFLMask[4] = {-1, -1, -1, -1};
15083 int PSHUFHMask[4] = {-1, -1, -1, -1};
15084 int PSHUFDMask[4] = {-1, -1, -1, -1};
15085
15086 // First fix the masks for all the inputs that are staying in their
15087 // original halves. This will then dictate the targets of the cross-half
15088 // shuffles.
15089 auto fixInPlaceInputs =
15090 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15091 MutableArrayRef<int> SourceHalfMask,
15092 MutableArrayRef<int> HalfMask, int HalfOffset) {
15093 if (InPlaceInputs.empty())
15094 return;
15095 if (InPlaceInputs.size() == 1) {
15096 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15097 InPlaceInputs[0] - HalfOffset;
15098 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15099 return;
15100 }
15101 if (IncomingInputs.empty()) {
15102 // Just fix all of the in place inputs.
15103 for (int Input : InPlaceInputs) {
15104 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15105 PSHUFDMask[Input / 2] = Input / 2;
15106 }
15107 return;
15108 }
15109
15110     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15111 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15112 InPlaceInputs[0] - HalfOffset;
15113 // Put the second input next to the first so that they are packed into
15114 // a dword. We find the adjacent index by toggling the low bit.
15115 int AdjIndex = InPlaceInputs[0] ^ 1;
15116 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15117 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15118 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15119 };
15120 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15121 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15122
15123 // Now gather the cross-half inputs and place them into a free dword of
15124 // their target half.
15125 // FIXME: This operation could almost certainly be simplified dramatically to
15126 // look more like the 3-1 fixing operation.
15127 auto moveInputsToRightHalf = [&PSHUFDMask](
15128 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15129 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15130 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15131 int DestOffset) {
15132 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15133 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15134 };
15135 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15136 int Word) {
15137 int LowWord = Word & ~1;
15138 int HighWord = Word | 1;
15139 return isWordClobbered(SourceHalfMask, LowWord) ||
15140 isWordClobbered(SourceHalfMask, HighWord);
15141 };
15142
15143 if (IncomingInputs.empty())
15144 return;
15145
15146 if (ExistingInputs.empty()) {
15147 // Map any dwords with inputs from them into the right half.
15148 for (int Input : IncomingInputs) {
15149 // If the source half mask maps over the inputs, turn those into
15150 // swaps and use the swapped lane.
15151 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15152 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15153 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15154 Input - SourceOffset;
15155 // We have to swap the uses in our half mask in one sweep.
15156 for (int &M : HalfMask)
15157 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15158 M = Input;
15159 else if (M == Input)
15160 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15161 } else {
15162 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15163 Input - SourceOffset &&
15164 "Previous placement doesn't match!");
15165 }
15166 // Note that this correctly re-maps both when we do a swap and when
15167 // we observe the other side of the swap above. We rely on that to
15168 // avoid swapping the members of the input list directly.
15169 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15170 }
15171
15172 // Map the input's dword into the correct half.
15173 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15174 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15175 else
15176 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15177 Input / 2 &&
15178 "Previous placement doesn't match!");
15179 }
15180
15181 // And just directly shift any other-half mask elements to be same-half
15182 // as we will have mirrored the dword containing the element into the
15183 // same position within that half.
15184 for (int &M : HalfMask)
15185 if (M >= SourceOffset && M < SourceOffset + 4) {
15186 M = M - SourceOffset + DestOffset;
15187 assert(M >= 0 && "This should never wrap below zero!");
15188 }
15189 return;
15190 }
15191
15192 // Ensure we have the input in a viable dword of its current half. This
15193 // is particularly tricky because the original position may be clobbered
15194 // by inputs being moved and *staying* in that half.
15195 if (IncomingInputs.size() == 1) {
15196 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15197 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15198 SourceOffset;
15199 SourceHalfMask[InputFixed - SourceOffset] =
15200 IncomingInputs[0] - SourceOffset;
15201 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15202 InputFixed);
15203 IncomingInputs[0] = InputFixed;
15204 }
15205 } else if (IncomingInputs.size() == 2) {
15206 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15207 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15208 // We have two non-adjacent or clobbered inputs we need to extract from
15209 // the source half. To do this, we need to map them into some adjacent
15210 // dword slot in the source mask.
15211 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15212 IncomingInputs[1] - SourceOffset};
15213
15214 // If there is a free slot in the source half mask adjacent to one of
15215 // the inputs, place the other input in it. We use (Index XOR 1) to
15216 // compute an adjacent index.
15217 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15218 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15219 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15220 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15221 InputsFixed[1] = InputsFixed[0] ^ 1;
15222 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15223 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15224 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15225 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15226 InputsFixed[0] = InputsFixed[1] ^ 1;
15227 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15228 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15229 // The two inputs are in the same DWord but it is clobbered and the
15230 // adjacent DWord isn't used at all. Move both inputs to the free
15231 // slot.
15232 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15233 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15234 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15235 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
15236 } else {
15237 // The only way we hit this point is if there is no clobbering
15238 // (because there are no off-half inputs to this half) and there is no
15239 // free slot adjacent to one of the inputs. In this case, we have to
15240 // swap an input with a non-input.
15241 for (int i = 0; i < 4; ++i)
15242 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
15243 "We can't handle any clobbers here!");
15244 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
15245 "Cannot have adjacent inputs here!");
15246
15247 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15248 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
15249
15250 // We also have to update the final source mask in this case because
15251 // it may need to undo the above swap.
15252 for (int &M : FinalSourceHalfMask)
15253 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
15254 M = InputsFixed[1] + SourceOffset;
15255 else if (M == InputsFixed[1] + SourceOffset)
15256 M = (InputsFixed[0] ^ 1) + SourceOffset;
15257
15258 InputsFixed[1] = InputsFixed[0] ^ 1;
15259 }
15260
15261 // Point everything at the fixed inputs.
15262 for (int &M : HalfMask)
15263 if (M == IncomingInputs[0])
15264 M = InputsFixed[0] + SourceOffset;
15265 else if (M == IncomingInputs[1])
15266 M = InputsFixed[1] + SourceOffset;
15267
15268 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
15269 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
15270 }
15271 } else {
15272 llvm_unreachable("Unhandled input size!")::llvm::llvm_unreachable_internal("Unhandled input size!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15272)
;
15273 }
15274
15275 // Now hoist the DWord down to the right half.
15276 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
15277 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
15278 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
15279 for (int &M : HalfMask)
15280 for (int Input : IncomingInputs)
15281 if (M == Input)
15282 M = FreeDWord * 2 + Input % 2;
15283 };
15284 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
15285 /*SourceOffset*/ 4, /*DestOffset*/ 0);
15286 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
15287 /*SourceOffset*/ 0, /*DestOffset*/ 4);
15288
15289 // Now enact all the shuffles we've computed to move the inputs into their
15290 // target half.
15291 if (!isNoopShuffleMask(PSHUFLMask))
15292 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15293 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
15294 if (!isNoopShuffleMask(PSHUFHMask))
15295 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15296 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
15297 if (!isNoopShuffleMask(PSHUFDMask))
15298 V = DAG.getBitcast(
15299 VT,
15300 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15301 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15302
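
For hand-checking the masks built above: getV4X86ShuffleImm8ForMask packs a 4-lane mask into the 2-bits-per-lane immediate consumed by PSHUFD/PSHUFLW/PSHUFHW. A hedged standalone sketch of that encoding (the helper name and the treatment of undef lanes as 0 are assumptions of this example):

// Illustrative sketch (not the real helper): encode a 4-lane shuffle mask into
// the PSHUF* immediate, two bits per destination lane, low lanes first.
// Undef lanes (-1) are encoded as 0 here, which is an assumption of this demo.
#include <cstdint>

static uint8_t encodeV4ShuffleImm8(const int Mask[4]) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i];
    Imm |= static_cast<uint8_t>(M & 0x3) << (2 * i);
  }
  return Imm; // e.g. the identity mask {0, 1, 2, 3} encodes to 0xE4
}
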
15303 // At this point, each half should contain all its inputs, and we can then
15304 // just shuffle them into their final position.
15305 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
15306 "Failed to lift all the high half inputs to the low mask!");
15307 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
15308 "Failed to lift all the low half inputs to the high mask!");
15309
15310 // Do a half shuffle for the low mask.
15311 if (!isNoopShuffleMask(LoMask))
15312 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15313 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15314
15315 // Do a half shuffle with the high mask after shifting its values down.
15316 for (int &M : HiMask)
15317 if (M >= 0)
15318 M -= 4;
15319 if (!isNoopShuffleMask(HiMask))
15320 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15321 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15322
15323 return V;
15324}
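
The routine above never emits more than a PSHUFLW, a PSHUFHW, and a PSHUFD. A minimal scalar model of those three primitives (illustrative only; it operates on plain arrays, not SDValues) can help when verifying a computed mask chain by hand:

// Illustrative scalar model of the three shuffles composed above. Each takes a
// 4-element mask, as in the PSHUF* immediates, and permutes eight 16-bit lanes.
#include <array>

using V8 = std::array<int, 8>;

static V8 pshuflw(const V8 &V, const int M[4]) { // permutes words 0..3 only
  V8 R = V;
  for (int i = 0; i != 4; ++i)
    R[i] = V[M[i]];
  return R;
}

static V8 pshufhw(const V8 &V, const int M[4]) { // permutes words 4..7 only
  V8 R = V;
  for (int i = 0; i != 4; ++i)
    R[4 + i] = V[4 + M[i]];
  return R;
}

static V8 pshufd(const V8 &V, const int M[4]) { // permutes dwords (word pairs)
  V8 R{};
  for (int j = 0; j != 4; ++j) {
    R[2 * j + 0] = V[2 * M[j] + 0];
    R[2 * j + 1] = V[2 * M[j] + 1];
  }
  return R;
}
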
15325
15326/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
15327/// blend if only one input is used.
15328static SDValue lowerShuffleAsBlendOfPSHUFBs(
15329 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15330 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
15331 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
15332 "Lane crossing shuffle masks not supported");
15333
15334 int NumBytes = VT.getSizeInBits() / 8;
15335 int Size = Mask.size();
15336 int Scale = NumBytes / Size;
15337
15338 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15339 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
15340 V1InUse = false;
15341 V2InUse = false;
15342
15343 for (int i = 0; i < NumBytes; ++i) {
15344 int M = Mask[i / Scale];
15345 if (M < 0)
15346 continue;
15347
15348 const int ZeroMask = 0x80;
15349 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
15350 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
15351 if (Zeroable[i / Scale])
15352 V1Idx = V2Idx = ZeroMask;
15353
15354 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
15355 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
15356 V1InUse |= (ZeroMask != V1Idx);
15357 V2InUse |= (ZeroMask != V2Idx);
15358 }
15359
15360 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
15361 if (V1InUse)
15362 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
15363 DAG.getBuildVector(ShufVT, DL, V1Mask));
15364 if (V2InUse)
15365 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
15366 DAG.getBuildVector(ShufVT, DL, V2Mask));
15367
15368 // If we need shuffled inputs from both, blend the two.
15369 SDValue V;
15370 if (V1InUse && V2InUse)
15371 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
15372 else
15373 V = V1InUse ? V1 : V2;
15374
15375 // Cast the result back to the correct type.
15376 return DAG.getBitcast(VT, V);
15377}
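
A small scalar model of the byte-index math above (illustrative only, 16-byte vectors assumed): each i16 mask entry is scaled to a pair of byte indices, and a control byte of 0x80 zeroes the corresponding destination byte, which is how the unused operand's lanes are cleared before the OR blend.

// Illustrative scalar model of a 16-byte PSHUFB and of the V1-side control
// bytes built above for the Scale == 2 (v8i16) case. Control bit 0x80 zeroes
// the destination byte; otherwise the low four bits select a source byte.
#include <array>
#include <cstdint>

using Bytes16 = std::array<uint8_t, 16>;

static Bytes16 pshufb(const Bytes16 &Src, const Bytes16 &Ctrl) {
  Bytes16 R{};
  for (int i = 0; i != 16; ++i)
    R[i] = (Ctrl[i] & 0x80) ? 0 : Src[Ctrl[i] & 0x0F];
  return R;
}

// Mirror of V1Idx above for one i16 mask entry M (0..15, or -1 for undef):
// words taken from V2 (M >= 8) or left undef are zeroed on the V1 side.
static void wordMaskToV1ByteCtrl(int M, uint8_t Ctrl[2]) {
  for (int b = 0; b != 2; ++b)
    Ctrl[b] = (M < 0 || M >= 8) ? 0x80 : static_cast<uint8_t>(M * 2 + b);
}
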
15378
15379/// Generic lowering of 8-lane i16 shuffles.
15380///
15381/// This handles both single-input shuffles and combined shuffle/blends with
15382/// two inputs. The single input shuffles are immediately delegated to
15383/// a dedicated lowering routine.
15384///
15385/// The blends are lowered in one of three fundamental ways. If there are few
15386/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
15387/// of the input is significantly cheaper when lowered as an interleaving of
15388/// the two inputs, try to interleave them. Otherwise, blend the low and high
15389/// halves of the inputs separately (making them have relatively few inputs)
15390/// and then concatenate them.
15391static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15392 const APInt &Zeroable, SDValue V1, SDValue V2,
15393 const X86Subtarget &Subtarget,
15394 SelectionDAG &DAG) {
15395 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15396 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
15397 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15398
15399 // Whenever we can lower this as a zext, that instruction is strictly faster
15400 // than any alternative.
15401 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
15402 Zeroable, Subtarget, DAG))
15403 return ZExt;
15404
15405 // Try to lower using a truncation.
15406 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15407 Subtarget, DAG))
15408 return V;
15409
15410 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
15411
15412 if (NumV2Inputs == 0) {
15413 // Try to use shift instructions.
15414 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
15415 Zeroable, Subtarget, DAG))
15416 return Shift;
15417
15418 // Check for being able to broadcast a single element.
15419 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
15420 Mask, Subtarget, DAG))
15421 return Broadcast;
15422
15423 // Try to use bit rotation instructions.
15424 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
15425 Subtarget, DAG))
15426 return Rotate;
15427
15428 // Use dedicated unpack instructions for masks that match their pattern.
15429 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15430 return V;
15431
15432 // Use dedicated pack instructions for masks that match their pattern.
15433 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15434 Subtarget))
15435 return V;
15436
15437 // Try to use byte rotation instructions.
15438 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
15439 Subtarget, DAG))
15440 return Rotate;
15441
15442 // Make a copy of the mask so it can be modified.
15443 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
15444 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
15445 Subtarget, DAG);
15446 }
15447
15448 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
15449 "All single-input shuffles should be canonicalized to be V1-input "
15450 "shuffles.");
15451
15452 // Try to use shift instructions.
15453 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
15454 Zeroable, Subtarget, DAG))
15455 return Shift;
15456
15457 // See if we can use SSE4A Extraction / Insertion.
15458 if (Subtarget.hasSSE4A())
15459 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
15460 Zeroable, DAG))
15461 return V;
15462
15463 // There are special ways we can lower some single-element blends.
15464 if (NumV2Inputs == 1)
15465 if (SDValue V = lowerShuffleAsElementInsertion(
15466 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
15467 return V;
15468
15469 // We have different paths for blend lowering, but they all must use the
15470 // *exact* same predicate.
15471 bool IsBlendSupported = Subtarget.hasSSE41();
15472 if (IsBlendSupported)
15473 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
15474 Zeroable, Subtarget, DAG))
15475 return Blend;
15476
15477 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
15478 Zeroable, Subtarget, DAG))
15479 return Masked;
15480
15481 // Use dedicated unpack instructions for masks that match their pattern.
15482 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
15483 return V;
15484
15485 // Use dedicated pack instructions for masks that match their pattern.
15486 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
15487 Subtarget))
15488 return V;
15489
15490 // Try to lower using a truncation.
15491 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
15492 Subtarget, DAG))
15493 return V;
15494
15495 // Try to use byte rotation instructions.
15496 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
15497 Subtarget, DAG))
15498 return Rotate;
15499
15500 if (SDValue BitBlend =
15501 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
15502 return BitBlend;
15503
15504 // Try to use byte shift instructions to mask.
15505 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
15506 Zeroable, Subtarget, DAG))
15507 return V;
15508
15509 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
15510 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
15511 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
15512 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
15513 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
15514 !Subtarget.hasVLX()) {
15515 SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
15516 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
15517 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
15518 SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
15519 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
15520 DWordClearMask);
15521 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
15522 DWordClearMask);
15523 // Now pack things back together.
15524 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
15525 if (NumEvenDrops == 2) {
15526 Result = DAG.getBitcast(MVT::v4i32, Result);
15527 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
15528 }
15529 return Result;
15530 }
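
The compaction above works because PACKUSDW becomes a lossless word extraction once the upper half of every dword is cleared by the AND. A scalar sketch of that identity (illustrative only):

// Illustrative scalar model of the AND + PACKUSDW compaction above. PACKUSDW
// saturates each 32-bit lane to an unsigned 16-bit value; clearing the upper
// 16 bits first makes the pack an exact truncation that concatenates the low
// words of the first operand with the low words of the second.
#include <algorithm>
#include <array>
#include <cstdint>

static std::array<uint16_t, 8> packusdw(const std::array<uint32_t, 4> &A,
                                        const std::array<uint32_t, 4> &B) {
  std::array<uint16_t, 8> R{};
  for (int i = 0; i != 4; ++i) {
    R[i] = static_cast<uint16_t>(std::min<uint32_t>(A[i], 0xFFFF));
    R[i + 4] = static_cast<uint16_t>(std::min<uint32_t>(B[i], 0xFFFF));
  }
  return R;
}
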
15531
15532 // Try to lower by permuting the inputs into an unpack instruction.
15533 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
15534 Mask, Subtarget, DAG))
15535 return Unpack;
15536
15537 // If we can't directly blend but can use PSHUFB, that will be better as it
15538 // can both shuffle and set up the inefficient blend.
15539 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
15540 bool V1InUse, V2InUse;
15541 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
15542 Zeroable, DAG, V1InUse, V2InUse);
15543 }
15544
15545 // We can always bit-blend if we have to so the fallback strategy is to
15546 // decompose into single-input permutes and blends/unpacks.
15547 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
15548 Mask, Subtarget, DAG);
15549}
15550
15551/// Lower 8-lane 16-bit floating point shuffles.
15552static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15553 const APInt &Zeroable, SDValue V1, SDValue V2,
15554 const X86Subtarget &Subtarget,
15555 SelectionDAG &DAG) {
15556 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15557 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
15558 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
15559 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
15560
15561 if (NumV2Elements == 0) {
15562 // Check for being able to broadcast a single element.
15563 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
15564 Mask, Subtarget, DAG))
15565 return Broadcast;
15566 }
15567 if (NumV2Elements == 1 && Mask[0] >= 8)
15568 if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask,
15569 Zeroable, Subtarget, DAG))
15570 return V;
15571
15572 V1 = DAG.getBitcast(MVT::v8i16, V1);
15573 V2 = DAG.getBitcast(MVT::v8i16, V2);
15574 return DAG.getBitcast(MVT::v8f16,
15575 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15576}
15577
15578// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets,
15579// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
15580// the active subvector is extracted.
15581static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
15582 ArrayRef<int> Mask, SDValue V1, SDValue V2,
15583 const X86Subtarget &Subtarget,
15584 SelectionDAG &DAG) {
15585 MVT MaskVT = VT.changeTypeToInteger();
15586 SDValue MaskNode;
15587 MVT ShuffleVT = VT;
15588 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
15589 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
15590 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
15591 ShuffleVT = V1.getSimpleValueType();
15592
15593 // Adjust mask to correct indices for the second input.
15594 int NumElts = VT.getVectorNumElements();
15595 unsigned Scale = 512 / VT.getSizeInBits();
15596 SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
15597 for (int &M : AdjustedMask)
15598 if (NumElts <= M)
15599 M += (Scale - 1) * NumElts;
15600 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
15601 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
15602 } else {
15603 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
15604 }
15605
15606 SDValue Result;
15607 if (V2.isUndef())
15608 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
15609 else
15610 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
15611
15612 if (VT != ShuffleVT)
15613 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
15614
15615 return Result;
15616}
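
When the operands are padded to 512 bits, indices that referred to the second input have to be re-based so they still land in V2's elements of the widened pair. A standalone sketch of that adjustment (mirrors the AdjustedMask loop above; names invented for the example):

// Illustrative sketch of the second-operand index adjustment above. After
// widening, V2's first element sits at index Scale * NumElts in VPERMV3
// terms, so an original index M in [NumElts, 2 * NumElts) must become
// (M - NumElts) + Scale * NumElts, i.e. M + (Scale - 1) * NumElts.
#include <vector>

static void adjustMaskForWidenedOperands(std::vector<int> &Mask, int NumElts,
                                         int Scale) {
  for (int &M : Mask)
    if (M >= NumElts)
      M += (Scale - 1) * NumElts;
}
// Example: v16i8 padded to v64i8 has Scale = 4, so index 16 (element 0 of V2)
// becomes 64, the first element of the widened second operand.
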
15617
15618/// Generic lowering of v16i8 shuffles.
15619///
15620/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
15621/// detect any complexity reducing interleaving. If that doesn't help, it uses
15622/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
15623/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
15624/// back together.
15625static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15626 const APInt &Zeroable, SDValue V1, SDValue V2,
15627 const X86Subtarget &Subtarget,
15628 SelectionDAG &DAG) {
15629 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15630 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
15631 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
15632
15633 // Try to use shift instructions.
15634 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
15635 Zeroable, Subtarget, DAG))
15636 return Shift;
15637
15638 // Try to use byte rotation instructions.
15639 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
15640 Subtarget, DAG))
15641 return Rotate;
15642
15643 // Use dedicated pack instructions for masks that match their pattern.
15644 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
15645 Subtarget))
15646 return V;
15647
15648 // Try to use a zext lowering.
15649 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
15650 Zeroable, Subtarget, DAG))
15651 return ZExt;
15652
15653 // Try to lower using a truncation.
15654 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15655 Subtarget, DAG))
15656 return V;
15657
15658 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
15659 Subtarget, DAG))
15660 return V;
15661
15662 // See if we can use SSE4A Extraction / Insertion.
15663 if (Subtarget.hasSSE4A())
15664 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
15665 Zeroable, DAG))
15666 return V;
15667
15668 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
15669
15670 // For single-input shuffles, there are some nicer lowering tricks we can use.
15671 if (NumV2Elements == 0) {
15672 // Check for being able to broadcast a single element.
15673 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
15674 Mask, Subtarget, DAG))
15675 return Broadcast;
15676
15677 // Try to use bit rotation instructions.
15678 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
15679 Subtarget, DAG))
15680 return Rotate;
15681
15682 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15683 return V;
15684
15685 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
15686 // Notably, this handles splat and partial-splat shuffles more efficiently.
15687 // However, it only makes sense if the pre-duplication shuffle simplifies
15688 // things significantly. Currently, this means we need to be able to
15689 // express the pre-duplication shuffle as an i16 shuffle.
15690 //
15691 // FIXME: We should check for other patterns which can be widened into an
15692 // i16 shuffle as well.
15693 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
15694 for (int i = 0; i < 16; i += 2)
15695 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
15696 return false;
15697
15698 return true;
15699 };
15700 auto tryToWidenViaDuplication = [&]() -> SDValue {
15701 if (!canWidenViaDuplication(Mask))
15702 return SDValue();
15703 SmallVector<int, 4> LoInputs;
15704 copy_if(Mask, std::back_inserter(LoInputs),
15705 [](int M) { return M >= 0 && M < 8; });
15706 array_pod_sort(LoInputs.begin(), LoInputs.end());
15707 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
15708 LoInputs.end());
15709 SmallVector<int, 4> HiInputs;
15710 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
15711 array_pod_sort(HiInputs.begin(), HiInputs.end());
15712 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
15713 HiInputs.end());
15714
15715 bool TargetLo = LoInputs.size() >= HiInputs.size();
15716 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
15717 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
15718
15719 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
15720 SmallDenseMap<int, int, 8> LaneMap;
15721 for (int I : InPlaceInputs) {
15722 PreDupI16Shuffle[I/2] = I/2;
15723 LaneMap[I] = I;
15724 }
15725 int j = TargetLo ? 0 : 4, je = j + 4;
15726 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
15727 // Check if j is already a shuffle of this input. This happens when
15728 // there are two adjacent bytes after we move the low one.
15729 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
15730 // If we haven't yet mapped the input, search for a slot into which
15731 // we can map it.
15732 while (j < je && PreDupI16Shuffle[j] >= 0)
15733 ++j;
15734
15735 if (j == je)
15736 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
15737 return SDValue();
15738
15739 // Map this input with the i16 shuffle.
15740 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
15741 }
15742
15743 // Update the lane map based on the mapping we ended up with.
15744 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
15745 }
15746 V1 = DAG.getBitcast(
15747 MVT::v16i8,
15748 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15749 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
15750
15751 // Unpack the bytes to form the i16s that will be shuffled into place.
15752 bool EvenInUse = false, OddInUse = false;
15753 for (int i = 0; i < 16; i += 2) {
15754 EvenInUse |= (Mask[i + 0] >= 0);
15755 OddInUse |= (Mask[i + 1] >= 0);
15756 if (EvenInUse && OddInUse)
15757 break;
15758 }
15759 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
15760 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
15761 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
15762
15763 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
15764 for (int i = 0; i < 16; ++i)
15765 if (Mask[i] >= 0) {
15766 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
15767 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
15768 if (PostDupI16Shuffle[i / 2] < 0)
15769 PostDupI16Shuffle[i / 2] = MappedMask;
15770 else
15771 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
15772 "Conflicting entries in the original shuffle!");
15773 }
15774 return DAG.getBitcast(
15775 MVT::v16i8,
15776 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
15777 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
15778 };
15779 if (SDValue V = tryToWidenViaDuplication())
15780 return V;
15781 }
15782
15783 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
15784 Zeroable, Subtarget, DAG))
15785 return Masked;
15786
15787 // Use dedicated unpack instructions for masks that match their pattern.
15788 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
15789 return V;
15790
15791 // Try to use byte shift instructions to mask.
15792 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
15793 Zeroable, Subtarget, DAG))
15794 return V;
15795
15796 // Check for compaction patterns.
15797 bool IsSingleInput = V2.isUndef();
15798 int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
15799
15800 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
15801 // with PSHUFB. It is important to do this before we attempt to generate any
15802 // blends but after all of the single-input lowerings. If the single input
15803 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
15804 // want to preserve that and we can DAG combine any longer sequences into
15805 // a PSHUFB in the end. But once we start blending from multiple inputs,
15806 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
15807 // and there are *very* few patterns that would actually be faster than the
15808 // PSHUFB approach because of its ability to zero lanes.
15809 //
15810 // If the mask is a binary compaction, we can more efficiently perform this
15811 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
15812 //
15813 // FIXME: The only exceptions to the above are blends which are exact
15814 // interleavings with direct instructions supporting them. We currently don't
15815 // handle those well here.
15816 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
15817 bool V1InUse = false;
15818 bool V2InUse = false;
15819
15820 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
15821 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
15822
15823 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
15824 // do so. This avoids using them to handle blends-with-zero which is
15825 // important as a single pshufb is significantly faster for that.
15826 if (V1InUse && V2InUse) {
15827 if (Subtarget.hasSSE41())
15828 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
15829 Zeroable, Subtarget, DAG))
15830 return Blend;
15831
15832 // We can use an unpack to do the blending rather than an or in some
15833 // cases. Even though the OR may be (very slightly) more efficient, we
15834 // prefer this lowering because there are common cases where part of
15835 // the complexity of the shuffles goes away when we do the final blend as
15836 // an unpack.
15837 // FIXME: It might be worth trying to detect if the unpack-feeding
15838 // shuffles will both be pshufb, in which case we shouldn't bother with
15839 // this.
15840 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
15841 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15842 return Unpack;
15843
15844 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
15845 if (Subtarget.hasVBMI())
15846 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
15847 DAG);
15848
15849 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
15850 if (Subtarget.hasXOP()) {
15851 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
15852 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
15853 }
15854
15855 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
15856 // PALIGNR will be cheaper than the second PSHUFB+OR.
15857 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
15858 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
15859 return V;
15860 }
15861
15862 return PSHUFB;
15863 }
15864
15865 // There are special ways we can lower some single-element blends.
15866 if (NumV2Elements == 1)
15867 if (SDValue V = lowerShuffleAsElementInsertion(
15868 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
15869 return V;
15870
15871 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
15872 return Blend;
15873
15874 // Check whether a compaction lowering can be done. This handles shuffles
15875 // which take every Nth element for some even N. See the helper function for
15876 // details.
15877 //
15878 // We special case these as they can be particularly efficiently handled with
15880 // the PACKUSWB instruction on x86, and they show up in common patterns of
15880 // rearranging bytes to truncate wide elements.
15881 if (NumEvenDrops) {
15882 // NumEvenDrops is the log2 of the element stride. Another way of
15883 // thinking about it is that we need to drop the even elements this many
15884 // times to get the original input.
15885
15886 // First we need to zero all the dropped bytes.
15887 assert(NumEvenDrops <= 3 &&
15888 "No support for dropping even elements more than 3 times.");
15889 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15890 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15891 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15892 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15893 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15894 WordClearMask);
15895 if (!IsSingleInput)
15896 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15897 WordClearMask);
15898
15899 // Now pack things back together.
15900 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15901 IsSingleInput ? V1 : V2);
15902 for (int i = 1; i < NumEvenDrops; ++i) {
15903 Result = DAG.getBitcast(MVT::v8i16, Result);
15904 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15905 }
15906 return Result;
15907 }
15908
15909 // Handle multi-input cases by blending/unpacking single-input shuffles.
15910 if (NumV2Elements > 0)
15911 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15912 Subtarget, DAG);
15913
15914 // The fallback path for single-input shuffles widens this into two v8i16
15915 // vectors with unpacks, shuffles those, and then pulls them back together
15916 // with a pack.
15917 SDValue V = V1;
15918
15919 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15920 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15921 for (int i = 0; i < 16; ++i)
15922 if (Mask[i] >= 0)
15923 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15924
15925 SDValue VLoHalf, VHiHalf;
15926 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15927 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15928 // i16s.
15929 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15930 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15931 // Use a mask to drop the high bytes.
15932 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15933 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15934 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15935
15936 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15937 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15938
15939 // Squash the masks to point directly into VLoHalf.
15940 for (int &M : LoBlendMask)
15941 if (M >= 0)
15942 M /= 2;
15943 for (int &M : HiBlendMask)
15944 if (M >= 0)
15945 M /= 2;
15946 } else {
15947 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15948 // VHiHalf so that we can blend them as i16s.
15949 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15950
15951 VLoHalf = DAG.getBitcast(
15952 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15953 VHiHalf = DAG.getBitcast(
15954 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15955 }
15956
15957 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15958 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15959
15960 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15961}
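
The fallback path above relies on two identities: unpacking bytes against zero produces zero-extended words, and PACKUSWB of such words reassembles the original bytes because every word already fits in eight bits. A scalar sketch of both primitives (illustrative only):

// Illustrative scalar models of the UNPCKL-with-zero and PACKUSWB steps used
// by the fallback above. unpackLoWithZero yields the low eight bytes of V
// zero-extended to words; packuswb saturates words back down to bytes.
#include <algorithm>
#include <array>
#include <cstdint>

static std::array<uint16_t, 8> unpackLoWithZero(const std::array<uint8_t, 16> &V) {
  std::array<uint16_t, 8> R{};
  for (int i = 0; i != 8; ++i)
    R[i] = V[i]; // low byte = V[i], high byte = 0
  return R;
}

static std::array<uint8_t, 16> packuswb(const std::array<uint16_t, 8> &A,
                                        const std::array<uint16_t, 8> &B) {
  std::array<uint8_t, 16> R{};
  for (int i = 0; i != 8; ++i) {
    R[i] = static_cast<uint8_t>(std::min<uint16_t>(A[i], 0xFF));
    R[i + 8] = static_cast<uint8_t>(std::min<uint16_t>(B[i], 0xFF));
  }
  return R;
}
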
15962
15963/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15964///
15965/// This routine breaks down the specific type of 128-bit shuffle and
15966/// dispatches to the lowering routines accordingly.
15967static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15968 MVT VT, SDValue V1, SDValue V2,
15969 const APInt &Zeroable,
15970 const X86Subtarget &Subtarget,
15971 SelectionDAG &DAG) {
15972 switch (VT.SimpleTy) {
15973 case MVT::v2i64:
15974 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15975 case MVT::v2f64:
15976 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15977 case MVT::v4i32:
15978 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15979 case MVT::v4f32:
15980 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15981 case MVT::v8i16:
15982 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15983 case MVT::v8f16:
15984 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15985 case MVT::v16i8:
15986 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15987
15988 default:
15989 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15989)
;
15990 }
15991}
15992
15993/// Generic routine to split vector shuffle into half-sized shuffles.
15994///
15995/// This routine just extracts two subvectors, shuffles them independently, and
15996/// then concatenates them back together. This should work effectively with all
15997/// AVX vector shuffle types.
15998static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15999 SDValue V2, ArrayRef<int> Mask,
16000 SelectionDAG &DAG) {
16001 assert(VT.getSizeInBits() >= 256 &&
16002 "Only for 256-bit or wider vector shuffles!");
16003 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16004 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16005
16006 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16007 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16008
16009 int NumElements = VT.getVectorNumElements();
16010 int SplitNumElements = NumElements / 2;
16011 MVT ScalarVT = VT.getVectorElementType();
16012 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16013
16014 // Use splitVector/extractSubVector so that split build-vectors just build two
16015 // narrower build vectors. This helps shuffling with splats and zeros.
16016 auto SplitVector = [&](SDValue V) {
16017 SDValue LoV, HiV;
16018 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16019 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16020 DAG.getBitcast(SplitVT, HiV));
16021 };
16022
16023 SDValue LoV1, HiV1, LoV2, HiV2;
16024 std::tie(LoV1, HiV1) = SplitVector(V1);
16025 std::tie(LoV2, HiV2) = SplitVector(V2);
16026
16027 // Now create two 4-way blends of these half-width vectors.
16028 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16029 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16030 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16031 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16032 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16033 for (int i = 0; i < SplitNumElements; ++i) {
16034 int M = HalfMask[i];
16035 if (M >= NumElements) {
16036 if (M >= NumElements + SplitNumElements)
16037 UseHiV2 = true;
16038 else
16039 UseLoV2 = true;
16040 V2BlendMask[i] = M - NumElements;
16041 BlendMask[i] = SplitNumElements + i;
16042 } else if (M >= 0) {
16043 if (M >= SplitNumElements)
16044 UseHiV1 = true;
16045 else
16046 UseLoV1 = true;
16047 V1BlendMask[i] = M;
16048 BlendMask[i] = i;
16049 }
16050 }
16051
16052 // Because the lowering happens after all combining takes place, we need to
16053 // manually combine these blend masks as much as possible so that we create
16054 // a minimal number of high-level vector shuffle nodes.
16055
16056 // First try just blending the halves of V1 or V2.
16057 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16058 return DAG.getUNDEF(SplitVT);
16059 if (!UseLoV2 && !UseHiV2)
16060 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16061 if (!UseLoV1 && !UseHiV1)
16062 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16063
16064 SDValue V1Blend, V2Blend;
16065 if (UseLoV1 && UseHiV1) {
16066 V1Blend =
16067 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16068 } else {
16069 // We only use half of V1 so map the usage down into the final blend mask.
16070 V1Blend = UseLoV1 ? LoV1 : HiV1;
16071 for (int i = 0; i < SplitNumElements; ++i)
16072 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16073 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16074 }
16075 if (UseLoV2 && UseHiV2) {
16076 V2Blend =
16077 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16078 } else {
16079 // We only use half of V2 so map the usage down into the final blend mask.
16080 V2Blend = UseLoV2 ? LoV2 : HiV2;
16081 for (int i = 0; i < SplitNumElements; ++i)
16082 if (BlendMask[i] >= SplitNumElements)
16083 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16084 }
16085 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16086 };
16087 SDValue Lo = HalfBlend(LoMask);
16088 SDValue Hi = HalfBlend(HiMask);
16089 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16090}
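
A compact standalone restatement of the classification HalfBlend performs above (illustrative only, simplified to just the use-tracking): each half-mask element selects either the low or the high half of V1 or of V2, and the combination of flags decides how few shuffles the blend needs.

// Illustrative sketch of the per-element classification done by HalfBlend
// above: record which halves of V1 and V2 a half-width mask actually touches.
struct HalfUse {
  bool LoV1 = false, HiV1 = false, LoV2 = false, HiV2 = false;
};

static HalfUse classifyHalfMask(const int *HalfMask, int SplitNumElements,
                                int NumElements) {
  HalfUse U;
  for (int i = 0; i != SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M < 0)
      continue;
    if (M >= NumElements)
      (M >= NumElements + SplitNumElements ? U.HiV2 : U.LoV2) = true;
    else
      (M >= SplitNumElements ? U.HiV1 : U.LoV1) = true;
  }
  return U;
}
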
16091
16092/// Either split a vector in halves or decompose the shuffles and the
16093/// blend/unpack.
16094///
16095/// This is provided as a good fallback for many lowerings of non-single-input
16096/// shuffles with more than one 128-bit lane. In those cases, we want to select
16097/// between splitting the shuffle into 128-bit components and stitching those
16098/// back together vs. extracting the single-input shuffles and blending those
16099/// results.
16100static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16101 SDValue V2, ArrayRef<int> Mask,
16102 const X86Subtarget &Subtarget,
16103 SelectionDAG &DAG) {
16104 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16105 "shuffles as it could then recurse on itself.");
16106 int Size = Mask.size();
16107
16108 // If this can be modeled as a broadcast of two elements followed by a blend,
16109 // prefer that lowering. This is especially important because broadcasts can
16110 // often fold with memory operands.
16111 auto DoBothBroadcast = [&] {
16112 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16113 for (int M : Mask)
16114 if (M >= Size) {
16115 if (V2BroadcastIdx < 0)
16116 V2BroadcastIdx = M - Size;
16117 else if (M - Size != V2BroadcastIdx)
16118 return false;
16119 } else if (M >= 0) {
16120 if (V1BroadcastIdx < 0)
16121 V1BroadcastIdx = M;
16122 else if (M != V1BroadcastIdx)
16123 return false;
16124 }
16125 return true;
16126 };
16127 if (DoBothBroadcast())
16128 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16129 DAG);
16130
16131 // If the inputs all stem from a single 128-bit lane of each input, then we
16132 // split them rather than blending because the split will decompose to
16133 // unusually few instructions.
16134 int LaneCount = VT.getSizeInBits() / 128;
16135 int LaneSize = Size / LaneCount;
16136 SmallBitVector LaneInputs[2];
16137 LaneInputs[0].resize(LaneCount, false);
16138 LaneInputs[1].resize(LaneCount, false);
16139 for (int i = 0; i < Size; ++i)
16140 if (Mask[i] >= 0)
16141 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16142 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16143 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16144
16145 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16146 // requires that the decomposed single-input shuffles don't end up here.
16147 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16148 DAG);
16149}
16150
16151// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16152// TODO: Extend to support v8f32 (+ 512-bit shuffles).
16153static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16154 SDValue V1, SDValue V2,
16155 ArrayRef<int> Mask,
16156 SelectionDAG &DAG) {
16157 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16158
16159 int LHSMask[4] = {-1, -1, -1, -1};
16160 int RHSMask[4] = {-1, -1, -1, -1};
16161 unsigned SHUFPMask = 0;
16162
16163 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16164 // perform the shuffle once the lanes have been shuffled in place.
16165 for (int i = 0; i != 4; ++i) {
16166 int M = Mask[i];
16167 if (M < 0)
16168 continue;
16169 int LaneBase = i & ~1;
16170 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16171 LaneMask[LaneBase + (M & 1)] = M;
16172 SHUFPMask |= (M & 1) << i;
16173 }
16174
16175 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16176 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16177 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16178 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16179}
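As an illustration of the selector construction above, here is a minimal standalone sketch (not LLVM code) that derives the SHUFPD immediate and the two per-operand lane masks for a hypothetical v4f64 mask {1, 5, 3, 7}, using the same conventions (0-3 select from V1, 4-7 from V2, -1 is undef):

#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> Mask = {1, 5, 3, 7}; // hypothetical example mask
  std::array<int, 4> LHSMask = {-1, -1, -1, -1};
  std::array<int, 4> RHSMask = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;                        // 0 for lane 0, 2 for lane 1
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask; // even destination elements feed the LHS
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (M & 1) << i;                    // one selector bit per destination element
  }
  std::printf("SHUFPD imm = 0x%x\n", SHUFPMask);  // prints 0xf for {1, 5, 3, 7}
  return 0;
}

The two lane masks would then be lowered as the VPERM2F128-style pre-shuffles named in the comment above, and the immediate picks the odd or even element of each pre-shuffled lane.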
16180
16181/// Lower a vector shuffle crossing multiple 128-bit lanes as
16182/// a lane permutation followed by a per-lane permutation.
16183///
16184/// This is mainly for cases where we can have non-repeating permutes
16185/// in each lane.
16186///
16187/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16188/// we should investigate merging them.
16189static SDValue lowerShuffleAsLanePermuteAndPermute(
16190 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16191 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16192 int NumElts = VT.getVectorNumElements();
16193 int NumLanes = VT.getSizeInBits() / 128;
16194 int NumEltsPerLane = NumElts / NumLanes;
16195 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
16196
16197 /// Attempts to find a sublane permute with the given size
16198 /// that gets all elements into their target lanes.
16199 ///
16200 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
16201 /// shuffle; if unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
16202 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
16203 int NumSublanesPerLane = NumSublanes / NumLanes;
16204 int NumEltsPerSublane = NumElts / NumSublanes;
16205
16206 SmallVector<int, 16> CrossLaneMask;
16207 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
16208 // Like CrossLaneMask, but with one entry per sublane.
16209 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
16210
16211 for (int i = 0; i != NumElts; ++i) {
16212 int M = Mask[i];
16213 if (M < 0)
16214 continue;
16215
16216 int SrcSublane = M / NumEltsPerSublane;
16217 int DstLane = i / NumEltsPerLane;
16218
16219 // We only need to get the elements into the right lane, not sublane.
16220 // So search all sublanes that make up the destination lane.
16221 bool Found = false;
16222 int DstSubStart = DstLane * NumSublanesPerLane;
16223 int DstSubEnd = DstSubStart + NumSublanesPerLane;
16224 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
16225 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
16226 continue;
16227
16228 Found = true;
16229 CrossLaneMaskLarge[DstSublane] = SrcSublane;
16230 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
16231 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
16232 break;
16233 }
16234 if (!Found)
16235 return SDValue();
16236 }
16237
16238 // Fill CrossLaneMask using CrossLaneMaskLarge.
16239 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
16240
16241 if (!CanUseSublanes) {
16242 // If we're only shuffling a single lowest lane and the rest are identity
16243 // then don't bother.
16244 // TODO - isShuffleMaskInputInPlace could be extended to something like
16245 // this.
16246 int NumIdentityLanes = 0;
16247 bool OnlyShuffleLowestLane = true;
16248 for (int i = 0; i != NumLanes; ++i) {
16249 int LaneOffset = i * NumEltsPerLane;
16250 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
16251 i * NumEltsPerLane))
16252 NumIdentityLanes++;
16253 else if (CrossLaneMask[LaneOffset] != 0)
16254 OnlyShuffleLowestLane = false;
16255 }
16256 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
16257 return SDValue();
16258 }
16259
16260 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
16261 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
16262 InLaneMask);
16263 };
16264
16265 // First attempt a solution with full lanes.
16266 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
16267 return V;
16268
16269 // The rest of the solutions use sublanes.
16270 if (!CanUseSublanes)
16271 return SDValue();
16272
16273 // Then attempt a solution with 64-bit sublanes (vpermq).
16274 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
16275 return V;
16276
16277 // If that doesn't work and we have fast variable cross-lane shuffle,
16278 // attempt 32-bit sublanes (vpermd).
16279 if (!Subtarget.hasFastVariableCrossLaneShuffle())
16280 return SDValue();
16281
16282 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
16283}
16284
16285/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
16286/// source with a lane permutation.
16287///
16288/// This lowering strategy results in four instructions in the worst case for a
16289/// single-input cross lane shuffle which is lower than any other fully general
16290/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
16291/// shuffle pattern should be handled prior to trying this lowering.
16292static SDValue lowerShuffleAsLanePermuteAndShuffle(
16293 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16294 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
16295 // FIXME: This should probably be generalized for 512-bit vectors as well.
16296 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
16297 int Size = Mask.size();
16298 int LaneSize = Size / 2;
16299
16300 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16301 // Only do this if the elements aren't all from the lower lane,
16302 // otherwise we're (probably) better off doing a split.
16303 if (VT == MVT::v4f64 &&
16304 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
16305 if (SDValue V =
16306 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
16307 return V;
16308
16309 // If there are only inputs from one 128-bit lane, splitting will in fact be
16310 // less expensive. The flags track whether the given lane contains an element
16311 // that crosses to another lane.
16312 if (!Subtarget.hasAVX2()) {
16313 bool LaneCrossing[2] = {false, false};
16314 for (int i = 0; i < Size; ++i)
16315 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
16316 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
16317 if (!LaneCrossing[0] || !LaneCrossing[1])
16318 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16319 } else {
16320 bool LaneUsed[2] = {false, false};
16321 for (int i = 0; i < Size; ++i)
16322 if (Mask[i] >= 0)
16323 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
16324 if (!LaneUsed[0] || !LaneUsed[1])
16325 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16326 }
16327
16328 // TODO - we could support shuffling V2 in the Flipped input.
16329 assert(V2.isUndef() &&
16330 "This last part of this routine only works on single input shuffles");
16331
16332 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
16333 for (int i = 0; i < Size; ++i) {
16334 int &M = InLaneMask[i];
16335 if (M < 0)
16336 continue;
16337 if (((M % Size) / LaneSize) != (i / LaneSize))
16338 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
16339 }
16340 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
16341 "In-lane shuffle mask expected");
16342
16343 // Flip the lanes, and shuffle the results which should now be in-lane.
16344 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
16345 SDValue Flipped = DAG.getBitcast(PVT, V1);
16346 Flipped =
16347 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
16348 Flipped = DAG.getBitcast(VT, Flipped);
16349 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
16350}
16351
16352/// Handle lowering 2-lane 128-bit shuffles.
16353static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
16354 SDValue V2, ArrayRef<int> Mask,
16355 const APInt &Zeroable,
16356 const X86Subtarget &Subtarget,
16357 SelectionDAG &DAG) {
16358 if (V2.isUndef()) {
16359 // Attempt to match VBROADCAST*128 subvector broadcast load.
16360 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
16361 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
16362 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
16363 MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
16364 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
16365 if (!Ld->isNonTemporal()) {
16366 MVT MemVT = VT.getHalfNumVectorElementsVT();
16367 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
16368 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
16369 SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
16370 TypeSize::Fixed(Ofs), DL);
16371 SDValue Ops[] = {Ld->getChain(), Ptr};
16372 SDValue BcastLd = DAG.getMemIntrinsicNode(
16373 X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
16374 DAG.getMachineFunction().getMachineMemOperand(
16375 Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
16376 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
16377 return BcastLd;
16378 }
16379 }
16380
16381 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
16382 if (Subtarget.hasAVX2())
16383 return SDValue();
16384 }
16385
16386 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
16387
16388 SmallVector<int, 4> WidenedMask;
16389 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
16390 return SDValue();
16391
16392 bool IsLowZero = (Zeroable & 0x3) == 0x3;
16393 bool IsHighZero = (Zeroable & 0xc) == 0xc;
16394
16395 // Try to use an insert into a zero vector.
16396 if (WidenedMask[0] == 0 && IsHighZero) {
16397 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16398 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16399 DAG.getIntPtrConstant(0, DL));
16400 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16401 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16402 DAG.getIntPtrConstant(0, DL));
16403 }
16404
16405 // TODO: If minimizing size and one of the inputs is a zero vector and the
16406 // zero vector has only one use, we could use a VPERM2X128 to save the
16407 // instruction bytes needed to explicitly generate the zero vector.
16408
16409 // Blends are faster and handle all the non-lane-crossing cases.
16410 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
16411 Subtarget, DAG))
16412 return Blend;
16413
16414 // If either input operand is a zero vector, use VPERM2X128 because its mask
16415 // allows us to replace the zero input with an implicit zero.
16416 if (!IsLowZero && !IsHighZero) {
16417 // Check for patterns which can be matched with a single insert of a 128-bit
16418 // subvector.
16419 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
16420 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
16421
16422 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
16423 // this will likely become vinsertf128 which can't fold a 256-bit memop.
16424 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
16425 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16426 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16427 OnlyUsesV1 ? V1 : V2,
16428 DAG.getIntPtrConstant(0, DL));
16429 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16430 DAG.getIntPtrConstant(2, DL));
16431 }
16432 }
16433
16434 // Try to use SHUF128 if possible.
16435 if (Subtarget.hasVLX()) {
16436 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
16437 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
16438 ((WidenedMask[1] % 2) << 1);
16439 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
16440 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16441 }
16442 }
16443 }
16444
16445 // Otherwise form a 128-bit permutation. After accounting for undefs,
16446 // convert the 64-bit shuffle mask selection values into 128-bit
16447 // selection bits by dividing the indexes by 2 and shifting into positions
16448 // defined by a vperm2*128 instruction's immediate control byte.
16449
16450 // The immediate permute control byte looks like this:
16451 // [1:0] - select 128 bits from sources for low half of destination
16452 // [2] - ignore
16453 // [3] - zero low half of destination
16454 // [5:4] - select 128 bits from sources for high half of destination
16455 // [6] - ignore
16456 // [7] - zero high half of destination
16457
16458 assert((WidenedMask[0] >= 0 || IsLowZero) &&
16459 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
16460
16461 unsigned PermMask = 0;
16462 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
16463 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
16464
16465 // Check the immediate mask and replace unused sources with undef.
16466 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
16467 V1 = DAG.getUNDEF(VT);
16468 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
16469 V2 = DAG.getUNDEF(VT);
16470
16471 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
16472 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16473}
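To make the control-byte layout above concrete, this is a minimal standalone sketch (not LLVM code) of how the vperm2f128/vperm2i128 immediate is assembled from a widened 2-element mask, with the zero bits driven by IsLowZero/IsHighZero; the widened mask {0, 3} is a hypothetical example:

#include <cstdio>

// Bits [1:0]/[5:4] select the 128-bit source half for the low/high destination
// half; bits [3]/[7] zero the corresponding destination half instead.
unsigned buildVPERM2X128Imm(int WidenedMask0, int WidenedMask1,
                            bool IsLowZero, bool IsHighZero) {
  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : unsigned(WidenedMask0) << 0;
  PermMask |= IsHighZero ? 0x80 : unsigned(WidenedMask1) << 4;
  return PermMask;
}

int main() {
  // Widened mask {0, 3}: low destination half from V1's low 128 bits,
  // high destination half from V2's high 128 bits -> immediate 0x30.
  std::printf("imm = 0x%02x\n", buildVPERM2X128Imm(0, 3, false, false));
  return 0;
}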
16474
16475/// Lower a vector shuffle by first fixing the 128-bit lanes and then
16476/// shuffling each lane.
16477///
16478/// This attempts to create a repeated lane shuffle where each lane uses one
16479/// or two of the lanes of the inputs. The lanes of the input vectors are
16480/// shuffled in one or two independent shuffles to get the lanes into the
16481/// position needed by the final shuffle.
16482static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
16483 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16484 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16485 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
21. '?' condition is true
16486
16487 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
22. Assuming the condition is false
23. Taking false branch
16488 return SDValue();
16489
16490 int NumElts = Mask.size();
16491 int NumLanes = VT.getSizeInBits() / 128;
16492 int NumLaneElts = 128 / VT.getScalarSizeInBits();
24. 'NumLaneElts' initialized here
16493 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
16494 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
16495
16496 // First pass will try to fill in the RepeatMask from lanes that need two
16497 // sources.
16498 for (int Lane = 0; Lane != NumLanes; ++Lane) {
25. Assuming 'Lane' is not equal to 'NumLanes'
26. Loop condition is true. Entering loop body
31. Assuming 'Lane' is equal to 'NumLanes'
32. Loop condition is false. Execution continues on line 16570
16499 int Srcs[2] = {-1, -1};
16500 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
16501 for (int i = 0; i != NumLaneElts; ++i) {
27. Assuming 'i' is equal to 'NumLaneElts'
28. Loop condition is false. Execution continues on line 16523
16502 int M = Mask[(Lane * NumLaneElts) + i];
16503 if (M < 0)
16504 continue;
16505 // Determine which of the possible input lanes (NumLanes from each source)
16506 // this element comes from. Assign that as one of the sources for this
16507 // lane. We can assign up to 2 sources for this lane. If we run out of
16508 // sources, we can't do anything.
16509 int LaneSrc = M / NumLaneElts;
16510 int Src;
16511 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
16512 Src = 0;
16513 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
16514 Src = 1;
16515 else
16516 return SDValue();
16517
16518 Srcs[Src] = LaneSrc;
16519 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
16520 }
16521
16522 // If this lane has two sources, see if it fits with the repeat mask so far.
16523 if (Srcs[1] < 0)
29. Taking true branch
16524 continue;
30. Execution continues on line 16498
16525
16526 LaneSrcs[Lane][0] = Srcs[0];
16527 LaneSrcs[Lane][1] = Srcs[1];
16528
16529 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
16530 assert(M1.size() == M2.size() && "Unexpected mask size");
16531 for (int i = 0, e = M1.size(); i != e; ++i)
16532 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
16533 return false;
16534 return true;
16535 };
16536
16537 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
16538 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
16539 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
16540 int M = Mask[i];
16541 if (M < 0)
16542 continue;
16543 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
16544 "Unexpected mask element");
16545 MergedMask[i] = M;
16546 }
16547 };
16548
16549 if (MatchMasks(InLaneMask, RepeatMask)) {
16550 // Merge this lane mask into the final repeat mask.
16551 MergeMasks(InLaneMask, RepeatMask);
16552 continue;
16553 }
16554
16555 // Didn't find a match. Swap the operands and try again.
16556 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
16557 ShuffleVectorSDNode::commuteMask(InLaneMask);
16558
16559 if (MatchMasks(InLaneMask, RepeatMask)) {
16560 // Merge this lane mask into the final repeat mask.
16561 MergeMasks(InLaneMask, RepeatMask);
16562 continue;
16563 }
16564
16565 // Couldn't find a match with the operands in either order.
16566 return SDValue();
16567 }
16568
16569 // Now handle any lanes with only one source.
16570 for (int Lane = 0; Lane != NumLanes; ++Lane) {
33. Loop condition is true. Entering loop body
37. Loop condition is false. Execution continues on line 16599
16571 // If this lane has already been processed, skip it.
16572 if (LaneSrcs[Lane][0] >= 0)
34. Assuming the condition is true
35. Taking true branch
16573 continue;
36. Execution continues on line 16570
16574
16575 for (int i = 0; i != NumLaneElts; ++i) {
16576 int M = Mask[(Lane * NumLaneElts) + i];
16577 if (M < 0)
16578 continue;
16579
16580 // If RepeatMask isn't defined yet we can define it ourself.
16581 if (RepeatMask[i] < 0)
16582 RepeatMask[i] = M % NumLaneElts;
16583
16584 if (RepeatMask[i] < NumElts) {
16585 if (RepeatMask[i] != M % NumLaneElts)
16586 return SDValue();
16587 LaneSrcs[Lane][0] = M / NumLaneElts;
16588 } else {
16589 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
16590 return SDValue();
16591 LaneSrcs[Lane][1] = M / NumLaneElts;
16592 }
16593 }
16594
16595 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
16596 return SDValue();
16597 }
16598
16599 SmallVector<int, 16> NewMask(NumElts, -1);
16600 for (int Lane = 0; Lane != NumLanes; ++Lane) {
38. Loop condition is true. Entering loop body
40. Loop condition is false. Execution continues on line 16609
16601 int Src = LaneSrcs[Lane][0];
16602 for (int i = 0; i != NumLaneElts; ++i) {
39. Loop condition is false. Execution continues on line 16600
16603 int M = -1;
16604 if (Src >= 0)
16605 M = Src * NumLaneElts + i;
16606 NewMask[Lane * NumLaneElts + i] = M;
16607 }
16608 }
16609 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16610 // Ensure we didn't get back the shuffle we started with.
16611 // FIXME: This is a hack to make up for some splat handling code in
16612 // getVectorShuffle.
16613 if (isa<ShuffleVectorSDNode>(NewV1) &&
41. Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
42. Taking false branch
16614 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
16615 return SDValue();
16616
16617 for (int Lane = 0; Lane != NumLanes; ++Lane) {
43. Loop condition is true. Entering loop body
45. Loop condition is false. Execution continues on line 16626
16618 int Src = LaneSrcs[Lane][1];
16619 for (int i = 0; i != NumLaneElts; ++i) {
44. Loop condition is false. Execution continues on line 16617
16620 int M = -1;
16621 if (Src >= 0)
16622 M = Src * NumLaneElts + i;
16623 NewMask[Lane * NumLaneElts + i] = M;
16624 }
16625 }
16626 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
16627 // Ensure we didn't get back the shuffle we started with.
16628 // FIXME: This is a hack to make up for some splat handling code in
16629 // getVectorShuffle.
16630 if (isa<ShuffleVectorSDNode>(NewV2) &&
46. Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
47. Taking false branch
16631 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
16632 return SDValue();
16633
16634 for (int i = 0; i != NumElts; ++i) {
48. Assuming 'i' is not equal to 'NumElts'
49. Loop condition is true. Entering loop body
16635 NewMask[i] = RepeatMask[i % NumLaneElts];
50. Division by zero
16636 if (NewMask[i] < 0)
16637 continue;
16638
16639 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
16640 }
16641 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
16642}
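The warning reported at line 16635 hinges on the arithmetic at line 16492: NumLaneElts is 128 / VT.getScalarSizeInBits(), so it is zero for any assumed scalar width above 128 bits, and the 'i % NumLaneElts' above then divides by zero. The legal vector types that reach this lowering have scalar widths far below 128 bits, but that invariant is not visible to the checker from this function alone, which is why the path with NumLaneElts == 0 (steps 24-50) is treated as feasible. Below is a minimal standalone sketch (not LLVM code) of the flagged arithmetic and one possible guard; the 256-bit scalar width is purely a hypothetical value chosen to trigger the case:

#include <cstdio>

int main() {
  int ScalarSizeInBits = 256;               // hypothetical width, only to show the analyzer's path
  int NumLaneElts = 128 / ScalarSizeInBits; // == 0 for any width > 128 bits
  if (NumLaneElts == 0) {
    // An early bail-out like this (or an assertion on the scalar width) would
    // make the invariant explicit and silence the checker.
    std::printf("guard: i %% NumLaneElts would divide by zero\n");
    return 0;
  }
  std::printf("RepeatMask index = %d\n", 5 % NumLaneElts);
  return 0;
}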
16643
16644/// If the input shuffle mask results in a vector that is undefined in all upper
16645/// or lower half elements and that mask accesses only 2 halves of the
16646/// shuffle's operands, return true. A mask of half the width with mask indexes
16647/// adjusted to access the extracted halves of the original shuffle operands is
16648/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
16649/// lower half of each input operand is accessed.
16650static bool
16651getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
16652 int &HalfIdx1, int &HalfIdx2) {
16653 assert((Mask.size() == HalfMask.size() * 2) &&
16654 "Expected input mask to be twice as long as output");
16655
16656 // Exactly one half of the result must be undef to allow narrowing.
16657 bool UndefLower = isUndefLowerHalf(Mask);
16658 bool UndefUpper = isUndefUpperHalf(Mask);
16659 if (UndefLower == UndefUpper)
16660 return false;
16661
16662 unsigned HalfNumElts = HalfMask.size();
16663 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
16664 HalfIdx1 = -1;
16665 HalfIdx2 = -1;
16666 for (unsigned i = 0; i != HalfNumElts; ++i) {
16667 int M = Mask[i + MaskIndexOffset];
16668 if (M < 0) {
16669 HalfMask[i] = M;
16670 continue;
16671 }
16672
16673 // Determine which of the 4 half vectors this element is from.
16674 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
16675 int HalfIdx = M / HalfNumElts;
16676
16677 // Determine the element index into its half vector source.
16678 int HalfElt = M % HalfNumElts;
16679
16680 // We can shuffle with up to 2 half vectors, set the new 'half'
16681 // shuffle mask accordingly.
16682 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
16683 HalfMask[i] = HalfElt;
16684 HalfIdx1 = HalfIdx;
16685 continue;
16686 }
16687 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
16688 HalfMask[i] = HalfElt + HalfNumElts;
16689 HalfIdx2 = HalfIdx;
16690 continue;
16691 }
16692
16693 // Too many half vectors referenced.
16694 return false;
16695 }
16696
16697 return true;
16698}
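For illustration, a minimal standalone sketch (not LLVM code) of the classification loop above, run on a hypothetical width-8 mask whose upper half is undef: each defined element is mapped to one of the four half vectors (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2) and the narrowed 4-element half mask is built from it.

#include <array>
#include <cstdio>

int main() {
  std::array<int, 8> Mask = {4, 5, 12, 13, -1, -1, -1, -1}; // hypothetical mask, upper half undef
  const int HalfNumElts = 4;
  std::array<int, 4> HalfMask{};
  int HalfIdx1 = -1, HalfIdx2 = -1;
  for (int i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i]; // the upper half is undef, so only the lower half is scanned
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }
    int HalfIdx = M / HalfNumElts; // which of the 4 half vectors
    int HalfElt = M % HalfNumElts; // index within that half
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
    } else {
      std::printf("too many half vectors referenced\n");
      return 1;
    }
  }
  std::printf("HalfIdx1=%d HalfIdx2=%d HalfMask={%d,%d,%d,%d}\n", HalfIdx1, HalfIdx2,
              HalfMask[0], HalfMask[1], HalfMask[2], HalfMask[3]);
  return 0;
}

For this mask the routine would report the upper halves of V1 and V2 (HalfIdx1 = 1, HalfIdx2 = 3) and the half mask {0, 1, 4, 5}.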
16699
16700/// Given the output values from getHalfShuffleMask(), create a half width
16701/// shuffle of extracted vectors followed by an insert back to full width.
16702static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
16703 ArrayRef<int> HalfMask, int HalfIdx1,
16704 int HalfIdx2, bool UndefLower,
16705 SelectionDAG &DAG, bool UseConcat = false) {
16706 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
16707 assert(V1.getValueType().isSimple() && "Expecting only simple types");
16708
16709 MVT VT = V1.getSimpleValueType();
16710 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16711 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16712
16713 auto getHalfVector = [&](int HalfIdx) {
16714 if (HalfIdx < 0)
16715 return DAG.getUNDEF(HalfVT);
16716 SDValue V = (HalfIdx < 2 ? V1 : V2);
16717 HalfIdx = (HalfIdx % 2) * HalfNumElts;
16718 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
16719 DAG.getIntPtrConstant(HalfIdx, DL));
16720 };
16721
16722 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
16723 SDValue Half1 = getHalfVector(HalfIdx1);
16724 SDValue Half2 = getHalfVector(HalfIdx2);
16725 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
16726 if (UseConcat) {
16727 SDValue Op0 = V;
16728 SDValue Op1 = DAG.getUNDEF(HalfVT);
16729 if (UndefLower)
16730 std::swap(Op0, Op1);
16731 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
16732 }
16733
16734 unsigned Offset = UndefLower ? HalfNumElts : 0;
16735 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
16736 DAG.getIntPtrConstant(Offset, DL));
16737}
16738
16739/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
16740/// This allows for fast cases such as subvector extraction/insertion
16741/// or shuffling smaller vector types which can lower more efficiently.
16742static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
16743 SDValue V2, ArrayRef<int> Mask,
16744 const X86Subtarget &Subtarget,
16745 SelectionDAG &DAG) {
16746 assert((VT.is256BitVector() || VT.is512BitVector()) &&
16747 "Expected 256-bit or 512-bit vector");
16748
16749 bool UndefLower = isUndefLowerHalf(Mask);
16750 if (!UndefLower && !isUndefUpperHalf(Mask))
16751 return SDValue();
16752
16753 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
16754 "Completely undef shuffle mask should have been simplified already");
16755
16756 // Upper half is undef and lower half is whole upper subvector.
16757 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
16758 MVT HalfVT = VT.getHalfNumVectorElementsVT();
16759 unsigned HalfNumElts = HalfVT.getVectorNumElements();
16760 if (!UndefLower &&
16761 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
16762 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16763 DAG.getIntPtrConstant(HalfNumElts, DL));
16764 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16765 DAG.getIntPtrConstant(0, DL));
16766 }
16767
16768 // Lower half is undef and upper half is whole lower subvector.
16769 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
16770 if (UndefLower &&
16771 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
16772 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
16773 DAG.getIntPtrConstant(0, DL));
16774 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
16775 DAG.getIntPtrConstant(HalfNumElts, DL));
16776 }
16777
16778 int HalfIdx1, HalfIdx2;
16779 SmallVector<int, 8> HalfMask(HalfNumElts);
16780 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
16781 return SDValue();
16782
16783 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
16784
16785 // Only shuffle the halves of the inputs when useful.
16786 unsigned NumLowerHalves =
16787 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16788 unsigned NumUpperHalves =
16789 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16790 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16791
16792 // Determine the larger pattern of undef/halves, then decide if it's worth
16793 // splitting the shuffle based on subtarget capabilities and types.
16794 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16795 if (!UndefLower) {
16796 // XXXXuuuu: no insert is needed.
16797 // Always extract lowers when setting lower - these are all free subreg ops.
16798 if (NumUpperHalves == 0)
16799 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16800 UndefLower, DAG);
16801
16802 if (NumUpperHalves == 1) {
16803 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16804 if (Subtarget.hasAVX2()) {
16805 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16806 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16807 !is128BitUnpackShuffleMask(HalfMask) &&
16808 (!isSingleSHUFPSMask(HalfMask) ||
16809 Subtarget.hasFastVariableCrossLaneShuffle()))
16810 return SDValue();
16811 // If this is a unary shuffle (assume that the 2nd operand is
16812 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16813 // are better off extracting the upper half of 1 operand and using a
16814 // narrow shuffle.
16815 if (EltWidth == 64 && V2.isUndef())
16816 return SDValue();
16817 }
16818 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16819 if (Subtarget.hasAVX512() && VT.is512BitVector())
16820 return SDValue();
16821 // Extract + narrow shuffle is better than the wide alternative.
16822 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16823 UndefLower, DAG);
16824 }
16825
16826 // Don't extract both uppers, instead shuffle and then extract.
16827 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16828 return SDValue();
16829 }
16830
16831 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16832 if (NumUpperHalves == 0) {
16833 // AVX2 has efficient 64-bit element cross-lane shuffles.
16834 // TODO: Refine to account for unary shuffle, splat, and other masks?
16835 if (Subtarget.hasAVX2() && EltWidth == 64)
16836 return SDValue();
16837 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16838 if (Subtarget.hasAVX512() && VT.is512BitVector())
16839 return SDValue();
16840 // Narrow shuffle + insert is better than the wide alternative.
16841 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16842 UndefLower, DAG);
16843 }
16844
16845 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16846 return SDValue();
16847}
16848
16849/// Test whether the specified input (0 or 1) is in-place blended by the
16850/// given mask.
16851///
16852/// This returns true if the elements from a particular input are already in the
16853/// slot required by the given mask and require no permutation.
16854static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
16855 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
16856 int Size = Mask.size();
16857 for (int i = 0; i < Size; ++i)
16858 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
16859 return false;
16860
16861 return true;
16862}
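A small standalone usage sketch (not LLVM code) of the in-place test above, with a hypothetical v4 mask: input 0 covers mask values [0, Size) and input 1 covers [Size, 2*Size), and an input is in place if every element taken from it already sits at its own index.

#include <cstdio>
#include <vector>

bool isInputInPlace(int Input, const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 5, 2, 7}; // hypothetical mask
  // V1 supplies elements 0 and 2 at indices 0 and 2; V2 supplies elements 5 and 7,
  // which map to indices 1 and 3 - so both inputs are already in place here.
  std::printf("V1 in place: %d, V2 in place: %d\n",
              (int)isInputInPlace(0, Mask), (int)isInputInPlace(1, Mask));
  return 0;
}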
16863
16864/// Handle case where shuffle sources are coming from the same 128-bit lane and
16865/// every lane can be represented as the same repeating mask - allowing us to
16866/// shuffle the sources with the repeating shuffle and then permute the result
16867/// to the destination lanes.
16868static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16869 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16870 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16871 int NumElts = VT.getVectorNumElements();
16872 int NumLanes = VT.getSizeInBits() / 128;
16873 int NumLaneElts = NumElts / NumLanes;
16874
16875 // On AVX2 we may be able to just shuffle the lowest elements and then
16876 // broadcast the result.
16877 if (Subtarget.hasAVX2()) {
16878 for (unsigned BroadcastSize : {16, 32, 64}) {
16879 if (BroadcastSize <= VT.getScalarSizeInBits())
16880 continue;
16881 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16882
16883 // Attempt to match a repeating pattern every NumBroadcastElts,
16884 // accounting for UNDEFs, but only referencing the lowest 128-bit
16885 // lane of the inputs.
16886 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16887 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16888 for (int j = 0; j != NumBroadcastElts; ++j) {
16889 int M = Mask[i + j];
16890 if (M < 0)
16891 continue;
16892 int &R = RepeatMask[j];
16893 if (0 != ((M % NumElts) / NumLaneElts))
16894 return false;
16895 if (0 <= R && R != M)
16896 return false;
16897 R = M;
16898 }
16899 return true;
16900 };
16901
16902 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16903 if (!FindRepeatingBroadcastMask(RepeatMask))
16904 continue;
16905
16906 // Shuffle the (lowest) repeated elements in place for broadcast.
16907 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16908
16909 // Shuffle the actual broadcast.
16910 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16911 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16912 for (int j = 0; j != NumBroadcastElts; ++j)
16913 BroadcastMask[i + j] = j;
16914 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16915 BroadcastMask);
16916 }
16917 }
16918
16919 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16920 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16921 return SDValue();
16922
16923 // Bail if we already have a repeated lane shuffle mask.
16924 SmallVector<int, 8> RepeatedShuffleMask;
16925 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
16926 return SDValue();
16927
16928 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16929 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
16930 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
16931 int NumSubLanes = NumLanes * SubLaneScale;
16932 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16933
16934 // Check that all the sources are coming from the same lane and see if we can
16935 // form a repeating shuffle mask (local to each sub-lane). At the same time,
16936 // determine the source sub-lane for each destination sub-lane.
16937 int TopSrcSubLane = -1;
16938 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16939 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
16940 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
16941 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
16942
16943 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16944 // Extract the sub-lane mask, check that it all comes from the same lane
16945 // and normalize the mask entries to come from the first lane.
16946 int SrcLane = -1;
16947 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16948 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16949 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16950 if (M < 0)
16951 continue;
16952 int Lane = (M % NumElts) / NumLaneElts;
16953 if ((0 <= SrcLane) && (SrcLane != Lane))
16954 return SDValue();
16955 SrcLane = Lane;
16956 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16957 SubLaneMask[Elt] = LocalM;
16958 }
16959
16960 // Whole sub-lane is UNDEF.
16961 if (SrcLane < 0)
16962 continue;
16963
16964 // Attempt to match against the candidate repeated sub-lane masks.
16965 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16966 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16967 for (int i = 0; i != NumSubLaneElts; ++i) {
16968 if (M1[i] < 0 || M2[i] < 0)
16969 continue;
16970 if (M1[i] != M2[i])
16971 return false;
16972 }
16973 return true;
16974 };
16975
16976 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16977 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16978 continue;
16979
16980 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16981 for (int i = 0; i != NumSubLaneElts; ++i) {
16982 int M = SubLaneMask[i];
16983 if (M < 0)
16984 continue;
16985 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16986 "Unexpected mask element");
16987 RepeatedSubLaneMask[i] = M;
16988 }
16989
16990 // Track the top most source sub-lane - by setting the remaining to UNDEF
16991 // we can greatly simplify shuffle matching.
16992 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16993 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16994 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16995 break;
16996 }
16997
16998 // Bail if we failed to find a matching repeated sub-lane mask.
16999 if (Dst2SrcSubLanes[DstSubLane] < 0)
17000 return SDValue();
17001 }
17002 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17003 "Unexpected source lane");
17004
17005 // Create a repeating shuffle mask for the entire vector.
17006 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17007 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17008 int Lane = SubLane / SubLaneScale;
17009 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17010 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17011 int M = RepeatedSubLaneMask[Elt];
17012 if (M < 0)
17013 continue;
17014 int Idx = (SubLane * NumSubLaneElts) + Elt;
17015 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17016 }
17017 }
17018 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17019
17020 // Shuffle each source sub-lane to its destination.
17021 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17022 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17023 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17024 if (SrcSubLane < 0)
17025 continue;
17026 for (int j = 0; j != NumSubLaneElts; ++j)
17027 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17028 }
17029
17030 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17031 SubLaneMask);
17032}
17033
17034static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17035 bool &ForceV1Zero, bool &ForceV2Zero,
17036 unsigned &ShuffleImm, ArrayRef<int> Mask,
17037 const APInt &Zeroable) {
17038 int NumElts = VT.getVectorNumElements();
17039 assert(VT.getScalarSizeInBits() == 64 &&
17040 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17041 "Unexpected data type for VSHUFPD");
17042 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17043 "Illegal shuffle mask");
17044
17045 bool ZeroLane[2] = { true, true };
17046 for (int i = 0; i < NumElts; ++i)
17047 ZeroLane[i & 1] &= Zeroable[i];
17048
17049 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
17050 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
17051 ShuffleImm = 0;
17052 bool ShufpdMask = true;
17053 bool CommutableMask = true;
17054 for (int i = 0; i < NumElts; ++i) {
17055 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17056 continue;
17057 if (Mask[i] < 0)
17058 return false;
17059 int Val = (i & 6) + NumElts * (i & 1);
17060 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17061 if (Mask[i] < Val || Mask[i] > Val + 1)
17062 ShufpdMask = false;
17063 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17064 CommutableMask = false;
17065 ShuffleImm |= (Mask[i] % 2) << i;
17066 }
17067
17068 if (!ShufpdMask && !CommutableMask)
17069 return false;
17070
17071 if (!ShufpdMask && CommutableMask)
17072 std::swap(V1, V2);
17073
17074 ForceV1Zero = ZeroLane[0];
17075 ForceV2Zero = ZeroLane[1];
17076 return true;
17077}
17078
17079static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17080 SDValue V2, ArrayRef<int> Mask,
17081 const APInt &Zeroable,
17082 const X86Subtarget &Subtarget,
17083 SelectionDAG &DAG) {
17084 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17085 "Unexpected data type for VSHUFPD");
17086
17087 unsigned Immediate = 0;
17088 bool ForceV1Zero = false, ForceV2Zero = false;
17089 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17090 Mask, Zeroable))
17091 return SDValue();
17092
17093 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17094 if (ForceV1Zero)
17095 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17096 if (ForceV2Zero)
17097 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17098
17099 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17100 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17101}
17102
17103 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17104 // by zeroable elements in the remaining 24 elements. Turn this into two
17105 // vpmovqb instructions shuffled together.
17106static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17107 SDValue V1, SDValue V2,
17108 ArrayRef<int> Mask,
17109 const APInt &Zeroable,
17110 SelectionDAG &DAG) {
17111 assert(VT == MVT::v32i8 && "Unexpected type!");
17112
17113 // The first 8 indices should be every 8th element.
17114 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17115 return SDValue();
17116
17117 // Remaining elements need to be zeroable.
17118 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17119 return SDValue();
17120
17121 V1 = DAG.getBitcast(MVT::v4i64, V1);
17122 V2 = DAG.getBitcast(MVT::v4i64, V2);
17123
17124 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17125 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17126
17127 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17128 // the upper bits of the result using an unpckldq.
17129 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17130 { 0, 1, 2, 3, 16, 17, 18, 19,
17131 4, 5, 6, 7, 20, 21, 22, 23 });
17132 // Insert the unpckldq into a zero vector to widen to v32i8.
17133 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17134 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17135 DAG.getIntPtrConstant(0, DL));
17136}
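To illustrate the shape test above, here is a minimal standalone sketch (not LLVM code) of the two conditions the lowering checks on a v32i8 mask - the first 8 indices must step by 8 (or be undef) and the remaining 24 result elements must be zeroable; the mask and zeroable bits below are hypothetical:

#include <array>
#include <cstdio>

int main() {
  std::array<int, 32> Mask{};
  std::array<bool, 32> Zeroable{};
  for (int i = 0; i != 8; ++i)
    Mask[i] = i * 8;                         // 0, 8, 16, ..., 56
  for (int i = 8; i != 32; ++i) {
    Mask[i] = -1;
    Zeroable[i] = true;                      // upper 24 results may be zeroed
  }

  bool FirstEightStride8 = true;
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * 8)
      FirstEightStride8 = false;

  bool RestZeroable = true;
  for (int i = 8; i != 32; ++i)
    if (!Zeroable[i])
      RestZeroable = false;

  std::printf("matches VTRUNC+unpack pattern: %s\n",
              FirstEightStride8 && RestZeroable ? "yes" : "no");
  return 0;
}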
17137
17138
17139/// Handle lowering of 4-lane 64-bit floating point shuffles.
17140///
17141/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
17142/// isn't available.
17143static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17144 const APInt &Zeroable, SDValue V1, SDValue V2,
17145 const X86Subtarget &Subtarget,
17146 SelectionDAG &DAG) {
17147 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17148 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
17149 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17150
17151 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
17152 Subtarget, DAG))
17153 return V;
17154
17155 if (V2.isUndef()) {
17156 // Check for being able to broadcast a single element.
17157 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
17158 Mask, Subtarget, DAG))
17159 return Broadcast;
17160
17161 // Use low duplicate instructions for masks that match their pattern.
17162 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
17163 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
17164
17165 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
17166 // Non-half-crossing single input shuffles can be lowered with an
17167 // interleaved permutation.
17168 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17169 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
17170 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
17171 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17172 }
17173
17174 // With AVX2 we have direct support for this permutation.
17175 if (Subtarget.hasAVX2())
17176 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
17177 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17178
17179 // Try to create an in-lane repeating shuffle mask and then shuffle the
17180 // results into the target lanes.
17181 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17182 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17183 return V;
17184
17185 // Try to permute the lanes and then use a per-lane permute.
17186 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
17187 Mask, DAG, Subtarget))
17188 return V;
17189
17190 // Otherwise, fall back.
17191 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
17192 DAG, Subtarget);
17193 }
17194
17195 // Use dedicated unpack instructions for masks that match their pattern.
17196 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
17197 return V;
17198
17199 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
17200 Zeroable, Subtarget, DAG))
17201 return Blend;
17202
17203 // Check if the blend happens to exactly fit that of SHUFPD.
17204 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
17205 Zeroable, Subtarget, DAG))
17206 return Op;
17207
17208 // If we have lane crossing shuffles AND they don't all come from the lower
17209 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17210 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
17211 // canonicalizes to a blend of splat, which isn't necessary for this combine.
17212 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
17213 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
17214 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
17215 (V2.getOpcode() != ISD::BUILD_VECTOR))
17216 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
17217 Mask, DAG))
17218 return Op;
17219
17220 // If we have one input in place, then we can permute the other input and
17221 // blend the result.
17222 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17223 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17224 Subtarget, DAG);
17225
17226 // Try to create an in-lane repeating shuffle mask and then shuffle the
17227 // results into the target lanes.
17228 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17229 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17230 return V;
17231
17232 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17233 // shuffle. However, if we have AVX2 and either input is already in place,
17234 // we will be able to shuffle the other input even across lanes in a single
17235 // instruction, so skip this pattern.
17236 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
17237 isShuffleMaskInputInPlace(1, Mask))))
17238 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
17239 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
17240 return V;
17241
17242 // If we have VLX support, we can use VEXPAND.
17243 if (Subtarget.hasVLX())
17244 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
17245 DAG, Subtarget))
17246 return V;
17247
17248 // If we have AVX2 then we always want to lower with a blend because at v4 we
17249 // can fully permute the elements.
17250 if (Subtarget.hasAVX2())
17251 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
17252 Subtarget, DAG);
17253
17254 // Otherwise fall back on generic lowering.
17255 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
17256 Subtarget, DAG);
17257}
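// For illustration only: a compile-time sketch of the in-lane VPERMILPD
// immediate built in lowerV4F64Shuffle above. Bit i of the immediate is set
// when Mask[i] selects the odd double of its 128-bit lane (1 in the low lane,
// 3 in the high lane); the mask is assumed to already be non-lane-crossing,
// as checked above.
#include <array>

constexpr unsigned sketchVPERMILPDImm(const std::array<int, 4> &Mask) {
  return unsigned(Mask[0] == 1) | (unsigned(Mask[1] == 1) << 1) |
         (unsigned(Mask[2] == 3) << 2) | (unsigned(Mask[3] == 3) << 3);
}

// Swapping the doubles inside each lane ({1, 0, 3, 2}) yields imm = 0b0101.
static_assert(sketchVPERMILPDImm({1, 0, 3, 2}) == 0x5, "unexpected immediate");
// Broadcasting the low double of each lane ({0, 0, 2, 2}) yields imm = 0.
static_assert(sketchVPERMILPDImm({0, 0, 2, 2}) == 0x0, "unexpected immediate");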
17258
17259/// Handle lowering of 4-lane 64-bit integer shuffles.
17260///
17261/// This routine is only called when we have AVX2 and thus a reasonable
17262/// instruction set for v4i64 shuffling.
17263static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17264 const APInt &Zeroable, SDValue V1, SDValue V2,
17265 const X86Subtarget &Subtarget,
17266 SelectionDAG &DAG) {
17267 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17268 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
17269 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
17270 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
17271
17272 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
17273 Subtarget, DAG))
17274 return V;
17275
17276 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
17277 Zeroable, Subtarget, DAG))
17278 return Blend;
17279
17280 // Check for being able to broadcast a single element.
17281 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
17282 Subtarget, DAG))
17283 return Broadcast;
17284
17285 if (V2.isUndef()) {
17286 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17287 // can use lower latency instructions that will operate on both lanes.
17288 SmallVector<int, 2> RepeatedMask;
17289 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
17290 SmallVector<int, 4> PSHUFDMask;
17291 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
17292 return DAG.getBitcast(
17293 MVT::v4i64,
17294 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
17295 DAG.getBitcast(MVT::v8i32, V1),
17296 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17297 }
17298
17299 // AVX2 provides a direct instruction for permuting a single input across
17300 // lanes.
17301 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
17302 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
17303 }
17304
17305 // Try to use shift instructions.
17306 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
17307 Zeroable, Subtarget, DAG))
17308 return Shift;
17309
17310 // If we have VLX support, we can use VALIGN or VEXPAND.
17311 if (Subtarget.hasVLX()) {
17312 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
17313 Subtarget, DAG))
17314 return Rotate;
17315
17316 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
17317 DAG, Subtarget))
17318 return V;
17319 }
17320
17321 // Try to use PALIGNR.
17322 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
17323 Subtarget, DAG))
17324 return Rotate;
17325
17326 // Use dedicated unpack instructions for masks that match their pattern.
17327 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
17328 return V;
17329
17330 // If we have one input in place, then we can permute the other input and
17331 // blend the result.
17332 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
17333 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17334 Subtarget, DAG);
17335
17336 // Try to create an in-lane repeating shuffle mask and then shuffle the
17337 // results into the target lanes.
17338 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17339 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17340 return V;
17341
17342 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17343 // shuffle. However, if we have AVX2 and either input is already in place,
17344 // we will be able to shuffle the other input even across lanes in a single
17345 // instruction, so skip this pattern.
17346 if (!isShuffleMaskInputInPlace(0, Mask) &&
17347 !isShuffleMaskInputInPlace(1, Mask))
17348 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17349 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
17350 return Result;
17351
17352 // Otherwise fall back on generic blend lowering.
17353 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
17354 Subtarget, DAG);
17355}
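// For illustration only: a sketch of the mask-narrowing step used above when a
// v4i64 shuffle repeats across its 128-bit lanes. Each 64-bit element index m
// is split into two 32-bit indices {2*m, 2*m + 1}, assuming the same widening
// rule as narrowShuffleMaskElts (undef entries simply stay undef here).
#include <cstdio>
#include <vector>

static std::vector<int> sketchNarrowMaskElts(const std::vector<int> &Mask) {
  std::vector<int> Narrowed;
  for (int M : Mask) {
    Narrowed.push_back(M < 0 ? -1 : 2 * M);     // Low half of the element.
    Narrowed.push_back(M < 0 ? -1 : 2 * M + 1); // High half of the element.
  }
  return Narrowed;
}

int main() {
  // A repeated v2i64 mask {1, 0} ("swap the quadwords in each lane") widens to
  // the v4i32 PSHUFD mask {2, 3, 0, 1}.
  for (int M : sketchNarrowMaskElts({1, 0}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}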
17356
17357/// Handle lowering of 8-lane 32-bit floating point shuffles.
17358///
17359/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
17360/// isn't available.
17361static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17362 const APInt &Zeroable, SDValue V1, SDValue V2,
17363 const X86Subtarget &Subtarget,
17364 SelectionDAG &DAG) {
17365 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17366 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
17367 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17368
17369 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
17370 Zeroable, Subtarget, DAG))
17371 return Blend;
17372
17373 // Check for being able to broadcast a single element.
17374 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
17375 Subtarget, DAG))
17376 return Broadcast;
17377
17378 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17379 // options to efficiently lower the shuffle.
17380 SmallVector<int, 4> RepeatedMask;
17381 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
17382 assert(RepeatedMask.size() == 4 &&
17383 "Repeated masks must be half the mask width!");
17384
17385 // Use even/odd duplicate instructions for masks that match their pattern.
17386 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17387 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
17388 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17389 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
17390
17391 if (V2.isUndef())
17392 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
17393 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17394
17395 // Use dedicated unpack instructions for masks that match their pattern.
17396 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
17397 return V;
17398
17399 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
17400 // have already handled any direct blends.
17401 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
17402 }
17403
17404 // Try to create an in-lane repeating shuffle mask and then shuffle the
17405 // results into the target lanes.
17406 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17407 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17408 return V;
17409
17410 // If we have a single input shuffle with different shuffle patterns in the
17411 // two 128-bit lanes use the variable mask to VPERMILPS.
17412 if (V2.isUndef()) {
17413 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
17414 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17415 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
17416 }
17417 if (Subtarget.hasAVX2()) {
17418 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17419 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
17420 }
17421 // Otherwise, fall back.
17422 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
17423 DAG, Subtarget);
17424 }
17425
17426 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17427 // shuffle.
17428 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17429 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
17430 return Result;
17431
17432 // If we have VLX support, we can use VEXPAND.
17433 if (Subtarget.hasVLX())
17434 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
17435 DAG, Subtarget))
17436 return V;
17437
17438 // For non-AVX512, if the Mask is of 16-bit elements in lane then try to split,
17439 // since after the split we get more efficient code using vpunpcklwd and
17440 // vpunpckhwd instructions than vblend.
17441 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
17442 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
17443 DAG);
17444
17445 // If we have AVX2 then we always want to lower with a blend because at v8 we
17446 // can fully permute the elements.
17447 if (Subtarget.hasAVX2())
17448 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
17449 Subtarget, DAG);
17450
17451 // Otherwise fall back on generic lowering.
17452 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
17453 Subtarget, DAG);
17454}
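// For illustration only: a simplified, single-operand sketch of the "repeated
// across 128-bit lanes" test used above for v8f32 (4 floats per lane). It
// ignores undef entries and the second shuffle operand, both of which the
// real is128BitLaneRepeatedShuffleMask handles.
#include <array>
#include <cstdio>

static bool sketchIsLaneRepeated(const std::array<int, 8> &Mask,
                                 std::array<int, 4> &Repeated) {
  for (int i = 0; i < 8; ++i) {
    if (Mask[i] / 4 != i / 4)
      return false;            // Element would cross a 128-bit lane.
    int Local = Mask[i] % 4;   // Index within its own lane.
    if (i < 4)
      Repeated[i] = Local;     // The first lane defines the repeated pattern.
    else if (Repeated[i % 4] != Local)
      return false;            // The second lane disagrees with the first.
  }
  return true;
}

int main() {
  std::array<int, 4> Rep{};
  // {1, 0, 3, 2, 5, 4, 7, 6} repeats {1, 0, 3, 2} in both lanes.
  bool Ok = sketchIsLaneRepeated({1, 0, 3, 2, 5, 4, 7, 6}, Rep);
  std::printf("repeated: %d -> {%d,%d,%d,%d}\n", Ok, Rep[0], Rep[1], Rep[2],
              Rep[3]);
  return 0;
}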
17455
17456/// Handle lowering of 8-lane 32-bit integer shuffles.
17457///
17458/// This routine is only called when we have AVX2 and thus a reasonable
17459/// instruction set for v8i32 shuffling.
17460static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17461 const APInt &Zeroable, SDValue V1, SDValue V2,
17462 const X86Subtarget &Subtarget,
17463 SelectionDAG &DAG) {
17464 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17465 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
17466 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17467 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
17468
17469 // Whenever we can lower this as a zext, that instruction is strictly faster
17470 // than any alternative. It also allows us to fold memory operands into the
17471 // shuffle in many cases.
17472 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
17473 Zeroable, Subtarget, DAG))
17474 return ZExt;
17475
17476 // For non-AVX512, if the Mask is of 16-bit elements in lane then try to split,
17477 // since after the split we get more efficient code than vblend by using
17478 // vpunpcklwd and vpunpckhwd instructions.
17479 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
17480 !Subtarget.hasAVX512())
17481 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
17482 DAG);
17483
17484 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
17485 Zeroable, Subtarget, DAG))
17486 return Blend;
17487
17488 // Check for being able to broadcast a single element.
17489 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
17490 Subtarget, DAG))
17491 return Broadcast;
17492
17493 // If the shuffle mask is repeated in each 128-bit lane we can use more
17494 // efficient instructions that mirror the shuffles across the two 128-bit
17495 // lanes.
17496 SmallVector<int, 4> RepeatedMask;
17497 bool Is128BitLaneRepeatedShuffle =
17498 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
17499 if (Is128BitLaneRepeatedShuffle) {
17500 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17501 if (V2.isUndef())
17502 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
17503 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17504
17505 // Use dedicated unpack instructions for masks that match their pattern.
17506 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
17507 return V;
17508 }
17509
17510 // Try to use shift instructions.
17511 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
17512 Zeroable, Subtarget, DAG))
17513 return Shift;
17514
17515 // If we have VLX support, we can use VALIGN or EXPAND.
17516 if (Subtarget.hasVLX()) {
17517 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
17518 Subtarget, DAG))
17519 return Rotate;
17520
17521 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
17522 DAG, Subtarget))
17523 return V;
17524 }
17525
17526 // Try to use byte rotation instructions.
17527 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
17528 Subtarget, DAG))
17529 return Rotate;
17530
17531 // Try to create an in-lane repeating shuffle mask and then shuffle the
17532 // results into the target lanes.
17533 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17534 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17535 return V;
17536
17537 if (V2.isUndef()) {
17538 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17539 // because that should be faster than the variable permute alternatives.
17540 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
17541 return V;
17542
17543 // If the shuffle patterns aren't repeated but it's a single input, directly
17544 // generate a cross-lane VPERMD instruction.
17545 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
17546 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
17547 }
17548
17549 // Assume that a single SHUFPS is faster than an alternative sequence of
17550 // multiple instructions (even if the CPU has a domain penalty).
17551 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17552 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17553 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
17554 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
17555 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
17556 CastV1, CastV2, DAG);
17557 return DAG.getBitcast(MVT::v8i32, ShufPS);
17558 }
17559
17560 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17561 // shuffle.
17562 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17563 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
17564 return Result;
17565
17566 // Otherwise fall back on generic blend lowering.
17567 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
17568 Subtarget, DAG);
17569}
17570
17571/// Handle lowering of 16-lane 16-bit integer shuffles.
17572///
17573/// This routine is only called when we have AVX2 and thus a reasonable
17574/// instruction set for v16i16 shuffling.
17575static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17576 const APInt &Zeroable, SDValue V1, SDValue V2,
17577 const X86Subtarget &Subtarget,
17578 SelectionDAG &DAG) {
17579 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17580 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
17581 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17582 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
17583
17584 // Whenever we can lower this as a zext, that instruction is strictly faster
17585 // than any alternative. It also allows us to fold memory operands into the
17586 // shuffle in many cases.
17587 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17588 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17589 return ZExt;
17590
17591 // Check for being able to broadcast a single element.
17592 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
17593 Subtarget, DAG))
17594 return Broadcast;
17595
17596 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
17597 Zeroable, Subtarget, DAG))
17598 return Blend;
17599
17600 // Use dedicated unpack instructions for masks that match their pattern.
17601 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
17602 return V;
17603
17604 // Use dedicated pack instructions for masks that match their pattern.
17605 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
17606 Subtarget))
17607 return V;
17608
17609 // Try to lower using a truncation.
17610 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17611 Subtarget, DAG))
17612 return V;
17613
17614 // Try to use shift instructions.
17615 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
17616 Zeroable, Subtarget, DAG))
17617 return Shift;
17618
17619 // Try to use byte rotation instructions.
17620 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17621 Subtarget, DAG))
17622 return Rotate;
17623
17624 // Try to create an in-lane repeating shuffle mask and then shuffle the
17625 // results into the target lanes.
17626 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17627 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17628 return V;
17629
17630 if (V2.isUndef()) {
17631 // Try to use bit rotation instructions.
17632 if (SDValue Rotate =
17633 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17634 return Rotate;
17635
17636 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17637 // because that should be faster than the variable permute alternatives.
17638 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
17639 return V;
17640
17641 // There are no generalized cross-lane shuffle operations available on i16
17642 // element types.
17643 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17644 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17645 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17646 return V;
17647
17648 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17649 DAG, Subtarget);
17650 }
17651
17652 SmallVector<int, 8> RepeatedMask;
17653 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17654 // As this is a single-input shuffle, the repeated mask should be
17655 // a strictly valid v8i16 mask that we can pass through to the v8i16
17656 // lowering to handle even the v16 case.
17657 return lowerV8I16GeneralSingleInputShuffle(
17658 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17659 }
17660 }
17661
17662 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17663 Zeroable, Subtarget, DAG))
17664 return PSHUFB;
17665
17666 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17667 if (Subtarget.hasBWI())
17668 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17669
17670 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17671 // shuffle.
17672 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17673 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17674 return Result;
17675
17676 // Try to permute the lanes and then use a per-lane permute.
17677 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17678 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17679 return V;
17680
17681 // Otherwise fall back on generic lowering.
17682 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
17683 Subtarget, DAG);
17684}
17685
17686/// Handle lowering of 32-lane 8-bit integer shuffles.
17687///
17688/// This routine is only called when we have AVX2 and thus a reasonable
17689/// instruction set for v32i8 shuffling.
17690static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17691 const APInt &Zeroable, SDValue V1, SDValue V2,
17692 const X86Subtarget &Subtarget,
17693 SelectionDAG &DAG) {
17694 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17695 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17696 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17697 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17698
17699 // Whenever we can lower this as a zext, that instruction is strictly faster
17700 // than any alternative. It also allows us to fold memory operands into the
17701 // shuffle in many cases.
17702 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17703 Zeroable, Subtarget, DAG))
17704 return ZExt;
17705
17706 // Check for being able to broadcast a single element.
17707 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17708 Subtarget, DAG))
17709 return Broadcast;
17710
17711 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17712 Zeroable, Subtarget, DAG))
17713 return Blend;
17714
17715 // Use dedicated unpack instructions for masks that match their pattern.
17716 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
17717 return V;
17718
17719 // Use dedicated pack instructions for masks that match their pattern.
17720 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
17721 Subtarget))
17722 return V;
17723
17724 // Try to lower using a truncation.
17725 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17726 Subtarget, DAG))
17727 return V;
17728
17729 // Try to use shift instructions.
17730 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
17731 Zeroable, Subtarget, DAG))
17732 return Shift;
17733
17734 // Try to use byte rotation instructions.
17735 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17736 Subtarget, DAG))
17737 return Rotate;
17738
17739 // Try to use bit rotation instructions.
17740 if (V2.isUndef())
17741 if (SDValue Rotate =
17742 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17743 return Rotate;
17744
17745 // Try to create an in-lane repeating shuffle mask and then shuffle the
17746 // results into the target lanes.
17747 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17748 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17749 return V;
17750
17751 // There are no generalized cross-lane shuffle operations available on i8
17752 // element types.
17753 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17754 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17755 // because that should be faster than the variable permute alternatives.
17756 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
17757 return V;
17758
17759 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17760 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17761 return V;
17762
17763 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17764 DAG, Subtarget);
17765 }
17766
17767 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17768 Zeroable, Subtarget, DAG))
17769 return PSHUFB;
17770
17771 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17772 if (Subtarget.hasVBMI())
17773 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17774
17775 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17776 // shuffle.
17777 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17778 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17779 return Result;
17780
17781 // Try to permute the lanes and then use a per-lane permute.
17782 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17783 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17784 return V;
17785
17786 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17787 // by zeroable elements in the remaining 24 elements. Turn this into two
17788 // vmovqb instructions shuffled together.
17789 if (Subtarget.hasVLX())
17790 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17791 Mask, Zeroable, DAG))
17792 return V;
17793
17794 // Otherwise fall back on generic lowering.
17795 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
17796 Subtarget, DAG);
17797}
17798
17799/// High-level routine to lower various 256-bit x86 vector shuffles.
17800///
17801/// This routine either breaks down the specific type of a 256-bit x86 vector
17802/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17803/// together based on the available instructions.
17804static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17805 SDValue V1, SDValue V2, const APInt &Zeroable,
17806 const X86Subtarget &Subtarget,
17807 SelectionDAG &DAG) {
17808 // If we have a single input to the zero element, insert that into V1 if we
17809 // can do so cheaply.
17810 int NumElts = VT.getVectorNumElements();
17811 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17812
17813 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17814 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17815 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17816 return Insertion;
17817
17818 // Handle special cases where the lower or upper half is UNDEF.
17819 if (SDValue V =
17820 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17821 return V;
17822
17823 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17824 // can check for those subtargets here and avoid much of the subtarget
17825 // querying in the per-vector-type lowering routines. With AVX1 we have
17826 // essentially *zero* ability to manipulate a 256-bit vector with integer
17827 // types. Since we'll use floating point types there eventually, just
17828 // immediately cast everything to a float and operate entirely in that domain.
17829 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17830 int ElementBits = VT.getScalarSizeInBits();
17831 if (ElementBits < 32) {
17832 // No floating point type available, if we can't use the bit operations
17833 // for masking/blending then decompose into 128-bit vectors.
17834 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17835 Subtarget, DAG))
17836 return V;
17837 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17838 return V;
17839 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17840 }
17841
17842 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17843 VT.getVectorNumElements());
17844 V1 = DAG.getBitcast(FpVT, V1);
17845 V2 = DAG.getBitcast(FpVT, V2);
17846 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17847 }
17848
17849 if (VT == MVT::v16f16) {
17850 V1 = DAG.getBitcast(MVT::v16i16, V1);
17851 V2 = DAG.getBitcast(MVT::v16i16, V2);
17852 return DAG.getBitcast(MVT::v16f16,
17853 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17854 }
17855
17856 switch (VT.SimpleTy) {
17857 case MVT::v4f64:
17858 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17859 case MVT::v4i64:
17860 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17861 case MVT::v8f32:
17862 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17863 case MVT::v8i32:
17864 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17865 case MVT::v16i16:
17866 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17867 case MVT::v32i8:
17868 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17869
17870 default:
17871 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17872 }
17873}
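// For illustration only: the "cast to the FP domain" idea described in
// lower256BitShuffle above, written with plain AVX intrinsics. On an AVX1-only
// target a v8i32 shuffle can be performed as a float shuffle via bitcasts,
// since the 256-bit integer shuffle instructions only arrive with AVX2.
// Compile with -mavx; the particular shuffle chosen here (VSHUFPS with
// immediate 0xB1, swapping adjacent elements in each lane) is an arbitrary
// example, not the lowering produced above.
#include <immintrin.h>

static __m256i swapAdjacentI32_avx1(__m256i V) {
  __m256 F = _mm256_castsi256_ps(V); // Reinterpret as 8 x float (no-op cast).
  F = _mm256_shuffle_ps(F, F, 0xB1); // {1, 0, 3, 2} within each 128-bit lane.
  return _mm256_castps_si256(F);     // Reinterpret back to 8 x i32.
}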
17874
17875/// Try to lower a vector shuffle as a 128-bit shuffles.
17876static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17877 const APInt &Zeroable, SDValue V1, SDValue V2,
17878 const X86Subtarget &Subtarget,
17879 SelectionDAG &DAG) {
17880 assert(VT.getScalarSizeInBits() == 64 &&
17881 "Unexpected element type size for 128bit shuffle.");
17882
17883 // Handling a 256-bit vector requires VLX, and the function
17884 // lowerV2X128VectorShuffle() is most probably a better solution.
17885 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17886
17887 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17888 SmallVector<int, 4> Widened128Mask;
17889 if (!canWidenShuffleElements(Mask, Widened128Mask))
17890 return SDValue();
17891 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17892
17893 // Try to use an insert into a zero vector.
17894 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17895 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17896 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17897 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17898 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17899 DAG.getIntPtrConstant(0, DL));
17900 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17901 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17902 DAG.getIntPtrConstant(0, DL));
17903 }
17904
17905 // Check for patterns which can be matched with a single insert of a 256-bit
17906 // subvector.
17907 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17908 if (OnlyUsesV1 ||
17909 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17910 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17911 SDValue SubVec =
17912 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17913 DAG.getIntPtrConstant(0, DL));
17914 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17915 DAG.getIntPtrConstant(4, DL));
17916 }
17917
17918 // See if this is an insertion of the lower 128-bits of V2 into V1.
17919 bool IsInsert = true;
17920 int V2Index = -1;
17921 for (int i = 0; i < 4; ++i) {
17922 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17923 if (Widened128Mask[i] < 0)
17924 continue;
17925
17926 // Make sure all V1 subvectors are in place.
17927 if (Widened128Mask[i] < 4) {
17928 if (Widened128Mask[i] != i) {
17929 IsInsert = false;
17930 break;
17931 }
17932 } else {
17933 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17934 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17935 IsInsert = false;
17936 break;
17937 }
17938 V2Index = i;
17939 }
17940 }
17941 if (IsInsert && V2Index >= 0) {
17942 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17943 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17944 DAG.getIntPtrConstant(0, DL));
17945 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17946 }
17947
17948 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
17949 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17950 // possible we at least ensure the lanes stay sequential to help later
17951 // combines.
17952 SmallVector<int, 2> Widened256Mask;
17953 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17954 Widened128Mask.clear();
17955 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17956 }
17957
17958 // Try to lower to vshuf64x2/vshuf32x4.
17959 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17960 unsigned PermMask = 0;
17961 // Ensure elements came from the same Op.
17962 for (int i = 0; i < 4; ++i) {
17963 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17964 if (Widened128Mask[i] < 0)
17965 continue;
17966
17967 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17968 unsigned OpIndex = i / 2;
17969 if (Ops[OpIndex].isUndef())
17970 Ops[OpIndex] = Op;
17971 else if (Ops[OpIndex] != Op)
17972 return SDValue();
17973
17974 // Convert the 128-bit shuffle mask selection values into 128-bit selection
17975 // bits defined by a vshuf64x2 instruction's immediate control byte.
17976 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
17977 }
17978
17979 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17980 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17981}
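// For illustration only: a compile-time sketch of the vshuf64x2 control byte
// built in lowerV4X128Shuffle above. Each of the four result positions gets a
// 2-bit field selecting one of the four 128-bit lanes of its source operand
// (positions 0-1 read the first operand, positions 2-3 the second), which is
// why the value is reduced modulo 4. Undef entries are skipped, just as above.
#include <array>

constexpr unsigned sketchSHUF128Imm(const std::array<int, 4> &Widened128Mask) {
  unsigned PermMask = 0;
  for (int i = 0; i < 4; ++i)
    if (Widened128Mask[i] >= 0)
      PermMask |= unsigned(Widened128Mask[i] % 4) << (i * 2);
  return PermMask;
}

// Concatenating the low 256 bits of V1 and V2 ({0, 1, 4, 5}) gives 0b01000100.
static_assert(sketchSHUF128Imm({0, 1, 4, 5}) == 0x44, "unexpected control byte");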
17982
17983/// Handle lowering of 8-lane 64-bit floating point shuffles.
17984static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17985 const APInt &Zeroable, SDValue V1, SDValue V2,
17986 const X86Subtarget &Subtarget,
17987 SelectionDAG &DAG) {
17988 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17989 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17990 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17991
17992 if (V2.isUndef()) {
17993 // Use low duplicate instructions for masks that match their pattern.
17994 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17995 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17996
17997 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17998 // Non-half-crossing single input shuffles can be lowered with an
17999 // interleaved permutation.
18000 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18001 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18002 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18003 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18004 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18005 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18006 }
18007
18008 SmallVector<int, 4> RepeatedMask;
18009 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18010 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18011 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18012 }
18013
18014 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18015 V2, Subtarget, DAG))
18016 return Shuf128;
18017
18018 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18019 return Unpck;
18020
18021 // Check if the blend happens to exactly fit that of SHUFPD.
18022 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18023 Zeroable, Subtarget, DAG))
18024 return Op;
18025
18026 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18027 DAG, Subtarget))
18028 return V;
18029
18030 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18031 Zeroable, Subtarget, DAG))
18032 return Blend;
18033
18034 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18035}
18036
18037/// Handle lowering of 16-lane 32-bit floating point shuffles.
18038static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18039 const APInt &Zeroable, SDValue V1, SDValue V2,
18040 const X86Subtarget &Subtarget,
18041 SelectionDAG &DAG) {
18042 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18043 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18044 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18045
18046 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18047 // options to efficiently lower the shuffle.
18048 SmallVector<int, 4> RepeatedMask;
18049 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18050 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18051
18052 // Use even/odd duplicate instructions for masks that match their pattern.
18053 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18054 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18055 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18056 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18057
18058 if (V2.isUndef())
18059 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
18060 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18061
18062 // Use dedicated unpack instructions for masks that match their pattern.
18063 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
18064 return V;
18065
18066 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
18067 Zeroable, Subtarget, DAG))
18068 return Blend;
18069
18070 // Otherwise, fall back to a SHUFPS sequence.
18071 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
18072 }
18073
18074 // Try to create an in-lane repeating shuffle mask and then shuffle the
18075 // results into the target lanes.
18076 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18077 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
18078 return V;
18079
18080 // If we have a single input shuffle with different shuffle patterns in the
18081 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
18082 if (V2.isUndef() &&
18083 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
18084 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
18085 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
18086 }
18087
18088 // If we have AVX512F support, we can use VEXPAND.
18089 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
18090 V1, V2, DAG, Subtarget))
18091 return V;
18092
18093 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
18094}
18095
18096/// Handle lowering of 8-lane 64-bit integer shuffles.
18097static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18098 const APInt &Zeroable, SDValue V1, SDValue V2,
18099 const X86Subtarget &Subtarget,
18100 SelectionDAG &DAG) {
18101 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18102 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
18103 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18104
18105 if (V2.isUndef()) {
18106 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18107 // can use lower latency instructions that will operate on all four
18108 // 128-bit lanes.
18109 SmallVector<int, 2> Repeated128Mask;
18110 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
18111 SmallVector<int, 4> PSHUFDMask;
18112 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
18113 return DAG.getBitcast(
18114 MVT::v8i64,
18115 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
18116 DAG.getBitcast(MVT::v16i32, V1),
18117 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18118 }
18119
18120 SmallVector<int, 4> Repeated256Mask;
18121 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
18122 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
18123 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
18124 }
18125
18126 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
18127 V2, Subtarget, DAG))
18128 return Shuf128;
18129
18130 // Try to use shift instructions.
18131 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
18132 Zeroable, Subtarget, DAG))
18133 return Shift;
18134
18135 // Try to use VALIGN.
18136 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
18137 Subtarget, DAG))
18138 return Rotate;
18139
18140 // Try to use PALIGNR.
18141 if (Subtarget.hasBWI())
18142 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
18143 Subtarget, DAG))
18144 return Rotate;
18145
18146 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
18147 return Unpck;
18148
18149 // If we have AVX512F support, we can use VEXPAND.
18150 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
18151 DAG, Subtarget))
18152 return V;
18153
18154 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
18155 Zeroable, Subtarget, DAG))
18156 return Blend;
18157
18158 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
18159}
18160
18161/// Handle lowering of 16-lane 32-bit integer shuffles.
18162static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18163 const APInt &Zeroable, SDValue V1, SDValue V2,
18164 const X86Subtarget &Subtarget,
18165 SelectionDAG &DAG) {
18166 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18167 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
18168 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18169
18170 // Whenever we can lower this as a zext, that instruction is strictly faster
18171 // than any alternative. It also allows us to fold memory operands into the
18172 // shuffle in many cases.
18173 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18174 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
18175 return ZExt;
18176
18177 // If the shuffle mask is repeated in each 128-bit lane we can use more
18178 // efficient instructions that mirror the shuffles across the four 128-bit
18179 // lanes.
18180 SmallVector<int, 4> RepeatedMask;
18181 bool Is128BitLaneRepeatedShuffle =
18182 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
18183 if (Is128BitLaneRepeatedShuffle) {
18184 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18185 if (V2.isUndef())
18186 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
18187 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18188
18189 // Use dedicated unpack instructions for masks that match their pattern.
18190 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
18191 return V;
18192 }
18193
18194 // Try to use shift instructions.
18195 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
18196 Zeroable, Subtarget, DAG))
18197 return Shift;
18198
18199 // Try to use VALIGN.
18200 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
18201 Subtarget, DAG))
18202 return Rotate;
18203
18204 // Try to use byte rotation instructions.
18205 if (Subtarget.hasBWI())
18206 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
18207 Subtarget, DAG))
18208 return Rotate;
18209
18210 // Assume that a single SHUFPS is faster than using a permv shuffle.
18211 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18212 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18213 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
18214 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
18215 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
18216 CastV1, CastV2, DAG);
18217 return DAG.getBitcast(MVT::v16i32, ShufPS);
18218 }
18219
18220 // Try to create an in-lane repeating shuffle mask and then shuffle the
18221 // results into the target lanes.
18222 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18223 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
18224 return V;
18225
18226 // If we have AVX512F support, we can use VEXPAND.
18227 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
18228 DAG, Subtarget))
18229 return V;
18230
18231 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
18232 Zeroable, Subtarget, DAG))
18233 return Blend;
18234
18235 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
18236}
18237
18238/// Handle lowering of 32-lane 16-bit integer shuffles.
18239static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18240 const APInt &Zeroable, SDValue V1, SDValue V2,
18241 const X86Subtarget &Subtarget,
18242 SelectionDAG &DAG) {
18243 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18244 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
18245 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18246 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
18247
18248 // Whenever we can lower this as a zext, that instruction is strictly faster
18249 // than any alternative. It also allows us to fold memory operands into the
18250 // shuffle in many cases.
18251 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18252 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18253 return ZExt;
18254
18255 // Use dedicated unpack instructions for masks that match their pattern.
18256 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
18257 return V;
18258
18259 // Use dedicated pack instructions for masks that match their pattern.
18260 if (SDValue V =
18261 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
18262 return V;
18263
18264 // Try to use shift instructions.
18265 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
18266 Zeroable, Subtarget, DAG))
18267 return Shift;
18268
18269 // Try to use byte rotation instructions.
18270 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
18271 Subtarget, DAG))
18272 return Rotate;
18273
18274 if (V2.isUndef()) {
18275 // Try to use bit rotation instructions.
18276 if (SDValue Rotate =
18277 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
18278 return Rotate;
18279
18280 SmallVector<int, 8> RepeatedMask;
18281 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
18282 // As this is a single-input shuffle, the repeated mask should be
18283 // a strictly valid v8i16 mask that we can pass through to the v8i16
18284 // lowering to handle even the v32 case.
18285 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
18286 RepeatedMask, Subtarget, DAG);
18287 }
18288 }
18289
18290 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
18291 Zeroable, Subtarget, DAG))
18292 return Blend;
18293
18294 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
18295 Zeroable, Subtarget, DAG))
18296 return PSHUFB;
18297
18298 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
18299}
18300
18301/// Handle lowering of 64-lane 8-bit integer shuffles.
18302static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18303 const APInt &Zeroable, SDValue V1, SDValue V2,
18304 const X86Subtarget &Subtarget,
18305 SelectionDAG &DAG) {
18306 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
[1] '?' condition is true
18307 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
[2] '?' condition is true
18308 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
[3] Assuming the condition is true
[4] '?' condition is true
18309 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
[5] Assuming the condition is true
[6] '?' condition is true
18310
18311 // Whenever we can lower this as a zext, that instruction is strictly faster
18312 // than any alternative. It also allows us to fold memory operands into the
18313 // shuffle in many cases.
18314 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
[7] Taking false branch
18315 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
18316 return ZExt;
18317
18318 // Use dedicated unpack instructions for masks that match their pattern.
18319 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
[8] Taking false branch
18320 return V;
18321
18322 // Use dedicated pack instructions for masks that match their pattern.
18323 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
[9] Taking false branch
18324 Subtarget))
18325 return V;
18326
18327 // Try to use shift instructions.
18328 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
[10] Taking false branch
18329 Zeroable, Subtarget, DAG))
18330 return Shift;
18331
18332 // Try to use byte rotation instructions.
18333 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
[11] Taking false branch
18334 Subtarget, DAG))
18335 return Rotate;
18336
18337 // Try to use bit rotation instructions.
18338 if (V2.isUndef())
[12] Taking false branch
18339 if (SDValue Rotate =
18340 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
18341 return Rotate;
18342
18343 // Lower as AND if possible.
18344 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
[13] Taking false branch
18345 Zeroable, Subtarget, DAG))
18346 return Masked;
18347
18348 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
[14] Taking false branch
18349 Zeroable, Subtarget, DAG))
18350 return PSHUFB;
18351
18352 // VBMI can use VPERMV/VPERMV3 byte shuffles.
18353 if (Subtarget.hasVBMI())
[15] Assuming the condition is false
[16] Taking false branch
18354 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
18355
18356 // Try to create an in-lane repeating shuffle mask and then shuffle the
18357 // results into the target lanes.
18358 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
[17] Taking false branch
18359 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18360 return V;
18361
18362 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
[18] Taking false branch
18363 Zeroable, Subtarget, DAG))
18364 return Blend;
18365
18366 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18367 // shuffle.
18368 if (!V2.isUndef())
[19] Taking true branch
18369 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
[20] Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
18370 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
18371 return Result;
18372
18373 // FIXME: Implement direct support for this type!
18374 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
18375}
18376
18377/// High-level routine to lower various 512-bit x86 vector shuffles.
18378///
18379/// This routine either breaks down the specific type of a 512-bit x86 vector
18380/// shuffle or splits it into two 256-bit shuffles and fuses the results back
18381/// together based on the available instructions.
18382static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18383 MVT VT, SDValue V1, SDValue V2,
18384 const APInt &Zeroable,
18385 const X86Subtarget &Subtarget,
18386 SelectionDAG &DAG) {
18387 assert(Subtarget.hasAVX512() &&
18388 "Cannot lower 512-bit vectors w/ basic ISA!");
18389
18390 // If we have a single input to the zero element, insert that into V1 if we
18391 // can do so cheaply.
18392 int NumElts = Mask.size();
18393 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18394
18395 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18396 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18397 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18398 return Insertion;
18399
18400 // Handle special cases where the lower or upper half is UNDEF.
18401 if (SDValue V =
18402 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18403 return V;
18404
18405 // Check for being able to broadcast a single element.
18406 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
18407 Subtarget, DAG))
18408 return Broadcast;
18409
18410 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
18411 // Try using bit ops for masking and blending before falling back to
18412 // splitting.
18413 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18414 Subtarget, DAG))
18415 return V;
18416 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18417 return V;
18418
18419 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18420 }
18421
18422 if (VT == MVT::v32f16) {
18423 V1 = DAG.getBitcast(MVT::v32i16, V1);
18424 V2 = DAG.getBitcast(MVT::v32i16, V2);
18425 return DAG.getBitcast(MVT::v32f16,
18426 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
18427 }
18428
18429 // Dispatch to each element type for lowering. If we don't have support for
18430 // specific element type shuffles at 512 bits, immediately split them and
18431 // lower them. Each lowering routine of a given type is allowed to assume that
18432 // the requisite ISA extensions for that element type are available.
18433 switch (VT.SimpleTy) {
18434 case MVT::v8f64:
18435 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18436 case MVT::v16f32:
18437 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18438 case MVT::v8i64:
18439 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18440 case MVT::v16i32:
18441 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18442 case MVT::v32i16:
18443 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18444 case MVT::v64i8:
18445 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18446
18447 default:
18448 llvm_unreachable("Not a valid 512-bit x86 vector type!");
18449 }
18450}
18451
18452static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
18453 MVT VT, SDValue V1, SDValue V2,
18454 const X86Subtarget &Subtarget,
18455 SelectionDAG &DAG) {
18456 // Shuffle should be unary.
18457 if (!V2.isUndef())
18458 return SDValue();
18459
18460 int ShiftAmt = -1;
18461 int NumElts = Mask.size();
18462 for (int i = 0; i != NumElts; ++i) {
18463 int M = Mask[i];
18464 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
18465 "Unexpected mask index.");
18466 if (M < 0)
18467 continue;
18468
18469 // The first non-undef element determines our shift amount.
18470 if (ShiftAmt < 0) {
18471 ShiftAmt = M - i;
18472 // Need to be shifting right.
18473 if (ShiftAmt <= 0)
18474 return SDValue();
18475 }
18476 // All non-undef elements must shift by the same amount.
18477 if (ShiftAmt != M - i)
18478 return SDValue();
18479 }
18480 assert(ShiftAmt >= 0 && "All undef?");
18481
18482 // Great, we found a shift right.
18483 MVT WideVT = VT;
18484 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18485 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18486 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18487 DAG.getUNDEF(WideVT), V1,
18488 DAG.getIntPtrConstant(0, DL));
18489 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
18490 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18491 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18492 DAG.getIntPtrConstant(0, DL));
18493}
18494
18495// Determine if this shuffle can be implemented with a KSHIFT instruction.
18496// Returns the shift amount if possible or -1 if not. This is a simplified
18497// version of matchShuffleAsShift.
18498static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
18499 int MaskOffset, const APInt &Zeroable) {
18500 int Size = Mask.size();
18501
18502 auto CheckZeros = [&](int Shift, bool Left) {
18503 for (int j = 0; j < Shift; ++j)
18504 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
18505 return false;
18506
18507 return true;
18508 };
18509
18510 auto MatchShift = [&](int Shift, bool Left) {
18511 unsigned Pos = Left ? Shift : 0;
18512 unsigned Low = Left ? 0 : Shift;
18513 unsigned Len = Size - Shift;
18514 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
18515 };
18516
18517 for (int Shift = 1; Shift != Size; ++Shift)
18518 for (bool Left : {true, false})
18519 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
18520 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
18521 return Shift;
18522 }
18523
18524 return -1;
18525}
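
Editorial aside (not part of the X86ISelLowering.cpp listing): a minimal standalone sketch of the matching idea in match1BitShuffleAsKSHIFT above, using plain std::vector in place of ArrayRef/APInt. The helper name matchKShift and the bool-vector Zeroable are invented here for illustration only; the in-tree routine additionally threads through MaskOffset and returns an X86ISD opcode.

#include <cstdio>
#include <vector>

// Returns the shift amount if Mask behaves like a whole-vector shift whose
// vacated lanes are all zeroable, or -1 otherwise. Left reports the direction.
static int matchKShift(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, bool &Left) {
  int Size = (int)Mask.size();
  for (int Shift = 1; Shift != Size; ++Shift) {
    for (bool L : {true, false}) {
      // Lanes shifted in must be zeroable: the low lanes for a left shift,
      // the high lanes for a right shift (mirrors the CheckZeros lambda).
      bool ZerosOk = true;
      for (int j = 0; j != Shift; ++j)
        ZerosOk &= (bool)Zeroable[j + (L ? 0 : Size - Shift)];
      if (!ZerosOk)
        continue;
      // The surviving lanes must be a sequential run, with -1 meaning undef
      // (mirrors the MatchShift lambda / isSequentialOrUndefInRange).
      int Pos = L ? Shift : 0, Low = L ? 0 : Shift, Len = Size - Shift;
      bool SeqOk = true;
      for (int i = 0; i != Len; ++i)
        SeqOk &= (Mask[Pos + i] < 0 || Mask[Pos + i] == Low + i);
      if (SeqOk) {
        Left = L;
        return Shift;
      }
    }
  }
  return -1;
}

int main() {
  bool Left;
  // v8i1 mask {2,3,4,5,6,7,u,u} with the top two lanes zeroable: KSHIFTR by 2.
  int S1 = matchKShift({2, 3, 4, 5, 6, 7, -1, -1},
                       {false, false, false, false, false, false, true, true},
                       Left);
  std::printf("KSHIFT%c by %d\n", Left ? 'L' : 'R', S1);
  // v8i1 mask {u,u,0,1,2,3,4,5} with the low two lanes zeroable: KSHIFTL by 2.
  int S2 = matchKShift({-1, -1, 0, 1, 2, 3, 4, 5},
                       {true, true, false, false, false, false, false, false},
                       Left);
  std::printf("KSHIFT%c by %d\n", Left ? 'L' : 'R', S2);
  return 0;
}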
18526
18527
18528// Lower vXi1 vector shuffles.
18529// There is no dedicated instruction on AVX-512 that shuffles the masks.
18530// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
18531// vector, shuffle, and then truncate it back.
18532static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
18533 MVT VT, SDValue V1, SDValue V2,
18534 const APInt &Zeroable,
18535 const X86Subtarget &Subtarget,
18536 SelectionDAG &DAG) {
18537 assert(Subtarget.hasAVX512() &&
18538 "Cannot lower 512-bit vectors w/o basic ISA!");
18539
18540 int NumElts = Mask.size();
18541
18542 // Try to recognize shuffles that are just padding a subvector with zeros.
18543 int SubvecElts = 0;
18544 int Src = -1;
18545 for (int i = 0; i != NumElts; ++i) {
18546 if (Mask[i] >= 0) {
18547 // Grab the source from the first valid mask. All subsequent elements need
18548 // to use this same source.
18549 if (Src < 0)
18550 Src = Mask[i] / NumElts;
18551 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18552 break;
18553 }
18554
18555 ++SubvecElts;
18556 }
18557 assert(SubvecElts != NumElts && "Identity shuffle?");
18558
18559 // Clip to a power of 2.
18560 SubvecElts = PowerOf2Floor(SubvecElts);
18561
18562 // Make sure the number of zeroable bits in the top at least covers the bits
18563 // not covered by the subvector.
18564 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
18565 assert(Src >= 0 && "Expected a source!");
18566 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18567 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
18568 Src == 0 ? V1 : V2,
18569 DAG.getIntPtrConstant(0, DL));
18570 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18571 DAG.getConstant(0, DL, VT),
18572 Extract, DAG.getIntPtrConstant(0, DL));
18573 }
18574
18575 // Try a simple shift right with undef elements. Later we'll try with zeros.
18576 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
18577 DAG))
18578 return Shift;
18579
18580 // Try to match KSHIFTs.
18581 unsigned Offset = 0;
18582 for (SDValue V : { V1, V2 }) {
18583 unsigned Opcode;
18584 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18585 if (ShiftAmt >= 0) {
18586 MVT WideVT = VT;
18587 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
18588 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18589 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
18590 DAG.getUNDEF(WideVT), V,
18591 DAG.getIntPtrConstant(0, DL));
18592 // Widened right shifts need two shifts to ensure we shift in zeroes.
18593 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18594 int WideElts = WideVT.getVectorNumElements();
18595 // Shift left to put the original vector in the MSBs of the new size.
18596 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18597 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18598 // Increase the shift amount to account for the left shift.
18599 ShiftAmt += WideElts - NumElts;
18600 }
18601
18602 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18603 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18604 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18605 DAG.getIntPtrConstant(0, DL));
18606 }
18607 Offset += NumElts; // Increment for next iteration.
18608 }
18609
18610
18611
18612 MVT ExtVT;
18613 switch (VT.SimpleTy) {
18614 default:
18615 llvm_unreachable("Expected a vector of i1 elements");
18616 case MVT::v2i1:
18617 ExtVT = MVT::v2i64;
18618 break;
18619 case MVT::v4i1:
18620 ExtVT = MVT::v4i32;
18621 break;
18622 case MVT::v8i1:
18623 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18624 // shuffle.
18625 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18626 break;
18627 case MVT::v16i1:
18628 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18629 // 256-bit operation available.
18630 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18631 break;
18632 case MVT::v32i1:
18633 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18634 // 256-bit operation available.
18635 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18636 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18637 break;
18638 case MVT::v64i1:
18639 // Fall back to scalarization. FIXME: We can do better if the shuffle
18640 // can be partitioned cleanly.
18641 if (!Subtarget.useBWIRegs())
18642 return SDValue();
18643 ExtVT = MVT::v64i8;
18644 break;
18645 }
18646
18647 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18648 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18649
18650 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18651 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
18652 int NumElems = VT.getVectorNumElements();
18653 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18654 (Subtarget.hasDQI() && (NumElems < 32)))
18655 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18656 Shuffle, ISD::SETGT);
18657
18658 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18659}
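
Editorial aside (not part of the listing): a scalar-bit illustration of why the widened KSHIFTR path above shifts left first. The concrete values below are invented for the example; the assumption is simply that the live v8i1 bits sit in the low byte of a widened 16-bit mask register while the high byte holds unspecified garbage from the widening.

#include <cstdint>
#include <cstdio>

int main() {
  // Live v8i1 bits 0xB5 in the low byte; 0xAB stands in for garbage above them.
  uint16_t Wide = 0xAB00 | 0x00B5;
  unsigned NumElts = 8, WideElts = 16, ShiftAmt = 3;

  // A plain right shift drags the garbage bits down into the live lanes.
  uint16_t Naive = (uint16_t)(Wide >> ShiftAmt);

  // Shift left to park the live bits in the MSBs, then shift right by the
  // adjusted amount so zeroes (not garbage) are shifted in -- the same fixup
  // the lowering applies with KSHIFTL before the KSHIFTR.
  uint16_t Fixed =
      (uint16_t)((uint16_t)(Wide << (WideElts - NumElts)) >>
                 (ShiftAmt + (WideElts - NumElts)));

  std::printf("naive: 0x%02x  fixed: 0x%02x  expected: 0x%02x\n",
              (unsigned)(Naive & 0xFF), (unsigned)(Fixed & 0xFF),
              (unsigned)(0xB5 >> ShiftAmt));
  return 0;
}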
18660
18661/// Helper function that returns true if the shuffle mask should be
18662/// commuted to improve canonicalization.
18663static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18664 int NumElements = Mask.size();
18665
18666 int NumV1Elements = 0, NumV2Elements = 0;
18667 for (int M : Mask)
18668 if (M < 0)
18669 continue;
18670 else if (M < NumElements)
18671 ++NumV1Elements;
18672 else
18673 ++NumV2Elements;
18674
18675 // Commute the shuffle as needed such that more elements come from V1 than
18676 // V2. This allows us to match the shuffle pattern strictly on how many
18677 // elements come from V1 without handling the symmetric cases.
18678 if (NumV2Elements > NumV1Elements)
18679 return true;
18680
18681 assert(NumV1Elements > 0 && "No V1 indices");
18682
18683 if (NumV2Elements == 0)
18684 return false;
18685
18686 // When the numbers of V1 and V2 elements are the same, try to minimize the
18687 // number of uses of V2 in the low half of the vector. When that is tied,
18688 // ensure that the sum of indices for V1 is equal to or lower than the sum
18689 // of indices for V2. When those are equal, try to ensure that the number of odd
18690 // indices for V1 is lower than the number of odd indices for V2.
18691 if (NumV1Elements == NumV2Elements) {
18692 int LowV1Elements = 0, LowV2Elements = 0;
18693 for (int M : Mask.slice(0, NumElements / 2))
18694 if (M >= NumElements)
18695 ++LowV2Elements;
18696 else if (M >= 0)
18697 ++LowV1Elements;
18698 if (LowV2Elements > LowV1Elements)
18699 return true;
18700 if (LowV2Elements == LowV1Elements) {
18701 int SumV1Indices = 0, SumV2Indices = 0;
18702 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18703 if (Mask[i] >= NumElements)
18704 SumV2Indices += i;
18705 else if (Mask[i] >= 0)
18706 SumV1Indices += i;
18707 if (SumV2Indices < SumV1Indices)
18708 return true;
18709 if (SumV2Indices == SumV1Indices) {
18710 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18711 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18712 if (Mask[i] >= NumElements)
18713 NumV2OddIndices += i % 2;
18714 else if (Mask[i] >= 0)
18715 NumV1OddIndices += i % 2;
18716 if (NumV2OddIndices < NumV1OddIndices)
18717 return true;
18718 }
18719 }
18720 }
18721
18722 return false;
18723}
18724
18725// Forward declaration.
18726static SDValue canonicalizeShuffleMaskWithHorizOp(
18727 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
18728 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18729 const X86Subtarget &Subtarget);
18730
18731 /// Top-level lowering for x86 vector shuffles.
18732///
18733/// This handles decomposition, canonicalization, and lowering of all x86
18734/// vector shuffles. Most of the specific lowering strategies are encapsulated
18735/// above in helper routines. The canonicalization attempts to widen shuffles
18736/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18737/// s.t. only one of the two inputs needs to be tested, etc.
18738static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18739 SelectionDAG &DAG) {
18740 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18741 ArrayRef<int> OrigMask = SVOp->getMask();
18742 SDValue V1 = Op.getOperand(0);
18743 SDValue V2 = Op.getOperand(1);
18744 MVT VT = Op.getSimpleValueType();
18745 int NumElements = VT.getVectorNumElements();
18746 SDLoc DL(Op);
18747 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18748
18749 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18750 "Can't lower MMX shuffles");
18751
18752 bool V1IsUndef = V1.isUndef();
18753 bool V2IsUndef = V2.isUndef();
18754 if (V1IsUndef && V2IsUndef)
18755 return DAG.getUNDEF(VT);
18756
18757 // When we create a shuffle node we put the UNDEF node as the second operand,
18758 // but in some cases the first operand may be transformed to UNDEF.
18759 // In this case we should just commute the node.
18760 if (V1IsUndef)
18761 return DAG.getCommutedVectorShuffle(*SVOp);
18762
18763 // Check for non-undef masks pointing at an undef vector and make the masks
18764 // undef as well. This makes it easier to match the shuffle based solely on
18765 // the mask.
18766 if (V2IsUndef &&
18767 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18768 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
18769 for (int &M : NewMask)
18770 if (M >= NumElements)
18771 M = -1;
18772 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18773 }
18774
18775 // Check for illegal shuffle mask element index values.
18776 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18777 (void)MaskUpperLimit;
18778 assert(llvm::all_of(OrigMask,
18779 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18780 "Out of bounds shuffle index");
18781
18782 // We actually see shuffles that are entirely re-arrangements of a set of
18783 // zero inputs. This mostly happens while decomposing complex shuffles into
18784 // simple ones. Directly lower these as a buildvector of zeros.
18785 APInt KnownUndef, KnownZero;
18786 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18787
18788 APInt Zeroable = KnownUndef | KnownZero;
18789 if (Zeroable.isAllOnesValue())
18790 return getZeroVector(VT, Subtarget, DAG, DL);
18791
18792 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18793
18794 // Try to collapse shuffles into using a vector type with fewer elements but
18795 // wider element types. We cap this to not form integers or floating point
18796 // elements wider than 64 bits. It does not seem beneficial to form i128
18797 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18798 SmallVector<int, 16> WidenedMask;
18799 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18800 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18801 // Shuffle mask widening should not interfere with a broadcast opportunity
18802 // by obfuscating the operands with bitcasts.
18803 // TODO: Avoid lowering directly from this top-level function: make this
18804 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18805 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18806 Subtarget, DAG))
18807 return Broadcast;
18808
18809 MVT NewEltVT = VT.isFloatingPoint()
18810 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18811 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18812 int NewNumElts = NumElements / 2;
18813 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18814 // Make sure that the new vector type is legal. For example, v2f64 isn't
18815 // legal on SSE1.
18816 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18817 if (V2IsZero) {
18818 // Modify the new Mask to take all zeros from the all-zero vector.
18819 // Choose indices that are blend-friendly.
18820 bool UsedZeroVector = false;
18821 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18822 "V2's non-undef elements are used?!");
18823 for (int i = 0; i != NewNumElts; ++i)
18824 if (WidenedMask[i] == SM_SentinelZero) {
18825 WidenedMask[i] = i + NewNumElts;
18826 UsedZeroVector = true;
18827 }
18828 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18829 // some elements to be undef.
18830 if (UsedZeroVector)
18831 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18832 }
18833 V1 = DAG.getBitcast(NewVT, V1);
18834 V2 = DAG.getBitcast(NewVT, V2);
18835 return DAG.getBitcast(
18836 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18837 }
18838 }
18839
18840 SmallVector<SDValue> Ops = {V1, V2};
18841 SmallVector<int> Mask(OrigMask.begin(), OrigMask.end());
18842
18843 // Canonicalize the shuffle with any horizontal ops inputs.
18844 // NOTE: This may update Ops and Mask.
18845 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18846 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18847 return DAG.getBitcast(VT, HOp);
18848
18849 V1 = DAG.getBitcast(VT, Ops[0]);
18850 V2 = DAG.getBitcast(VT, Ops[1]);
18851 assert(NumElements == (int)Mask.size() &&
18852 "canonicalizeShuffleMaskWithHorizOp "
18853 "shouldn't alter the shuffle mask size");
18854
18855 // Commute the shuffle if it will improve canonicalization.
18856 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18857 ShuffleVectorSDNode::commuteMask(Mask);
18858 std::swap(V1, V2);
18859 }
18860
18861 // For each vector width, delegate to a specialized lowering routine.
18862 if (VT.is128BitVector())
18863 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18864
18865 if (VT.is256BitVector())
18866 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18867
18868 if (VT.is512BitVector())
18869 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18870
18871 if (Is1BitVector)
18872 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18873
18874 llvm_unreachable("Unimplemented!");
18875}
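
Editorial aside (not part of the listing): the element-widening step near the top of lowerVECTOR_SHUFFLE above, mocked up on plain integer masks. This simplified widenMask only accepts pairs that are fully undef or an even-aligned consecutive run; the in-tree canWidenShuffleElements also handles zeroable elements and mixed undef pairs, so treat this as a sketch of the idea rather than of that API.

#include <cstdio>
#include <vector>

// Returns true and fills Widened if every pair (2i, 2i+1) of Mask can be
// expressed as a single element of a vector with twice-as-wide elements.
static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 < 0 && M1 < 0) {
      Widened.push_back(-1);           // both undef -> undef wide element
    } else if (M0 >= 0 && M0 % 2 == 0 && M1 == M0 + 1) {
      Widened.push_back(M0 / 2);       // aligned pair -> one wide element
    } else {
      return false;                    // this shuffle can't be widened
    }
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  // A v8i32 mask that swaps the two middle 64-bit chunks widens to a v4i64 mask.
  if (widenMask({0, 1, 4, 5, 2, 3, 6, 7}, Wide)) {
    for (int M : Wide)
      std::printf("%d ", M);           // prints: 0 2 1 3
    std::printf("\n");
  }
  return 0;
}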
18876
18877/// Try to lower a VSELECT instruction to a vector shuffle.
18878static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18879 const X86Subtarget &Subtarget,
18880 SelectionDAG &DAG) {
18881 SDValue Cond = Op.getOperand(0);
18882 SDValue LHS = Op.getOperand(1);
18883 SDValue RHS = Op.getOperand(2);
18884 MVT VT = Op.getSimpleValueType();
18885
18886 // Only non-legal VSELECTs reach this lowering; convert those into generic
18887 // shuffles and re-use the shuffle lowering path for blends.
18888 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18889 SmallVector<int, 32> Mask;
18890 if (createShuffleMaskFromVSELECT(Mask, Cond))
18891 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18892 }
18893
18894 return SDValue();
18895}
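
Editorial aside (not part of the listing): how a constant VSELECT condition maps onto the shuffle mask fed to DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask) above. The helper name maskFromVSelect and the bool-vector condition are invented for illustration; the in-tree createShuffleMaskFromVSELECT inspects the BUILD_VECTOR condition node itself.

#include <cstdio>
#include <vector>

static std::vector<int> maskFromVSelect(const std::vector<bool> &CondIsTrue) {
  int NumElts = (int)CondIsTrue.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = CondIsTrue[i] ? i : i + NumElts; // LHS lane i, or RHS lane i
  return Mask;
}

int main() {
  // select <1,0,0,1>, LHS, RHS  ==>  shufflevector LHS, RHS, <0,5,6,3>
  for (int M : maskFromVSelect({true, false, false, true}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}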
18896
18897SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18898 SDValue Cond = Op.getOperand(0);
18899 SDValue LHS = Op.getOperand(1);
18900 SDValue RHS = Op.getOperand(2);
18901
18902 // A vselect where all conditions and data are constants can be optimized into
18903 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18904 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18905 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18906 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18907 return SDValue();
18908
18909 // Try to lower this to a blend-style vector shuffle. This can handle all
18910 // constant condition cases.
18911 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18912 return BlendOp;
18913
18914 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18915 // with patterns on the mask registers on AVX-512.
18916 MVT CondVT = Cond.getSimpleValueType();
18917 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18918 if (CondEltSize == 1)
18919 return Op;
18920
18921 // Variable blends are only legal from SSE4.1 onward.
18922 if (!Subtarget.hasSSE41())
18923 return SDValue();
18924
18925 SDLoc dl(Op);
18926 MVT VT = Op.getSimpleValueType();
18927 unsigned EltSize = VT.getScalarSizeInBits();
18928 unsigned NumElts = VT.getVectorNumElements();
18929
18930 // Expand v32i16/v64i8 without BWI.
18931 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18932 return SDValue();
18933
18934 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18935 // into an i1 condition so that we can use the mask-based 512-bit blend
18936 // instructions.
18937 if (VT.getSizeInBits() == 512) {
18938 // Build a mask by testing the condition against zero.
18939 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18940 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18941 DAG.getConstant(0, dl, CondVT),
18942 ISD::SETNE);
18943 // Now return a new VSELECT using the mask.
18944 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18945 }
18946
18947 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18948 if (CondEltSize != EltSize) {
18949 // If we don't have a sign splat, rely on the expansion.
18950 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18951 return SDValue();
18952
18953 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18954 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18955 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18956 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18957 }
18958
18959 // Only some types will be legal on some subtargets. If we can emit a legal
18960 // VSELECT-matching blend, return Op; but if we need to expand, return
18961 // a null value.
18962 switch (VT.SimpleTy) {
18963 default:
18964 // Most of the vector types have blends past SSE4.1.
18965 return Op;
18966
18967 case MVT::v32i8:
18968 // The byte blends for AVX vectors were introduced only in AVX2.
18969 if (Subtarget.hasAVX2())
18970 return Op;
18971
18972 return SDValue();
18973
18974 case MVT::v8i16:
18975 case MVT::v16i16: {
18976 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18977 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18978 Cond = DAG.getBitcast(CastVT, Cond);
18979 LHS = DAG.getBitcast(CastVT, LHS);
18980 RHS = DAG.getBitcast(CastVT, RHS);
18981 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18982 return DAG.getBitcast(VT, Select);
18983 }
18984 }
18985}
18986
18987static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18988 MVT VT = Op.getSimpleValueType();
18989 SDValue Vec = Op.getOperand(0);
18990 SDValue Idx = Op.getOperand(1);
18991 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18992 SDLoc dl(Op);
18993
18994 if (!Vec.getSimpleValueType().is128BitVector())
18995 return SDValue();
18996
18997 if (VT.getSizeInBits() == 8) {
18998 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18999 // we're going to zero extend the register or fold the store.
19000 if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
19001 !MayFoldIntoStore(Op))
19002 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
19003 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19004 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19005
19006 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
19007 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
19008 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19009 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19010 }
19011
19012 if (VT == MVT::f32) {
19013 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
19014 // the result back to FR32 register. It's only worth matching if the
19015 // result has a single use which is a store or a bitcast to i32. And in
19016 // the case of a store, it's not worth it if the index is a constant 0,
19017 // because a MOVSSmr can be used instead, which is smaller and faster.
19018 if (!Op.hasOneUse())
19019 return SDValue();
19020 SDNode *User = *Op.getNode()->use_begin();
19021 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
19022 (User->getOpcode() != ISD::BITCAST ||
19023 User->getValueType(0) != MVT::i32))
19024 return SDValue();
19025 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19026 DAG.getBitcast(MVT::v4i32, Vec), Idx);
19027 return DAG.getBitcast(MVT::f32, Extract);
19028 }
19029
19030 if (VT == MVT::i32 || VT == MVT::i64)
19031 return Op;
19032
19033 return SDValue();
19034}
19035
19036/// Extract one bit from mask vector, like v16i1 or v8i1.
19037/// AVX-512 feature.
19038static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
19039 const X86Subtarget &Subtarget) {
19040 SDValue Vec = Op.getOperand(0);
19041 SDLoc dl(Vec);
19042 MVT VecVT = Vec.getSimpleValueType();
19043 SDValue Idx = Op.getOperand(1);
19044 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19045 MVT EltVT = Op.getSimpleValueType();
19046
19047 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
19048 "Unexpected vector type in ExtractBitFromMaskVector");
19049
19050 // variable index can't be handled in mask registers,
19051 // extend vector to VR512/128
19052 if (!IdxC) {
19053 unsigned NumElts = VecVT.getVectorNumElements();
19054 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
19055 // than extending to 128/256-bit.
19056 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19057 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19058 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
19059 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
19060 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
19061 }
19062
19063 unsigned IdxVal = IdxC->getZExtValue();
19064 if (IdxVal == 0) // the operation is legal
19065 return Op;
19066
19067 // Extend to natively supported kshift.
19068 unsigned NumElems = VecVT.getVectorNumElements();
19069 MVT WideVecVT = VecVT;
19070 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19071 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19072 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19073 DAG.getUNDEF(WideVecVT), Vec,
19074 DAG.getIntPtrConstant(0, dl));
19075 }
19076
19077 // Use kshiftr instruction to move to the lower element.
19078 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19079 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19080
19081 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19082 DAG.getIntPtrConstant(0, dl));
19083}
19084
19085SDValue
19086X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
19087 SelectionDAG &DAG) const {
19088 SDLoc dl(Op);
19089 SDValue Vec = Op.getOperand(0);
19090 MVT VecVT = Vec.getSimpleValueType();
19091 SDValue Idx = Op.getOperand(1);
19092 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
19093
19094 if (VecVT.getVectorElementType() == MVT::i1)
19095 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
19096
19097 if (!IdxC) {
19098 // It's more profitable to go through memory (1 cycle throughput)
19099 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
19100 // The IACA tool was used to get the performance estimate
19101 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
19102 //
19103 // example : extractelement <16 x i8> %a, i32 %i
19104 //
19105 // Block Throughput: 3.00 Cycles
19106 // Throughput Bottleneck: Port5
19107 //
19108 // | Num Of | Ports pressure in cycles | |
19109 // | Uops | 0 - DV | 5 | 6 | 7 | |
19110 // ---------------------------------------------
19111 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
19112 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
19113 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
19114 // Total Num Of Uops: 4
19115 //
19116 //
19117 // Block Throughput: 1.00 Cycles
19118 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
19119 //
19120 // | | Ports pressure in cycles | |
19121 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
19122 // ---------------------------------------------------------
19123 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
19124 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
19125 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
19126 // Total Num Of Uops: 4
19127
19128 return SDValue();
19129 }
19130
19131 unsigned IdxVal = IdxC->getZExtValue();
19132
19133 // If this is a 256-bit vector result, first extract the 128-bit vector and
19134 // then extract the element from the 128-bit vector.
19135 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
19136 // Get the 128-bit vector.
19137 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
19138 MVT EltVT = VecVT.getVectorElementType();
19139
19140 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
19141 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
19142
19143 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
19144 // this can be done with a mask.
19145 IdxVal &= ElemsPerChunk - 1;
19146 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
19147 DAG.getIntPtrConstant(IdxVal, dl));
19148 }
19149
19150 assert(VecVT.is128BitVector() && "Unexpected vector length");
19151
19152 MVT VT = Op.getSimpleValueType();
19153
19154 if (VT == MVT::i16) {
19155 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
19156 // we're going to zero extend the register or fold the store (SSE41 only).
19157 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
19158 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) {
19159 if (Subtarget.hasFP16())
19160 return Op;
19161
19162 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
19163 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19164 DAG.getBitcast(MVT::v4i32, Vec), Idx));
19165 }
19166
19167 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
19168 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19169 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
19170 }
19171
19172 if (Subtarget.hasSSE41())
19173 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
19174 return Res;
19175
19176 // TODO: We only extract a single element from v16i8, we can probably afford
19177 // to be more aggressive here before using the default approach of spilling to
19178 // stack.
19179 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
19180 // Extract either the lowest i32 or any i16, and extract the sub-byte.
19181 int DWordIdx = IdxVal / 4;
19182 if (DWordIdx == 0) {
19183 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
19184 DAG.getBitcast(MVT::v4i32, Vec),
19185 DAG.getIntPtrConstant(DWordIdx, dl));
19186 int ShiftVal = (IdxVal % 4) * 8;
19187 if (ShiftVal != 0)
19188 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
19189 DAG.getConstant(ShiftVal, dl, MVT::i8));
19190 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19191 }
19192
19193 int WordIdx = IdxVal / 2;
19194 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
19195 DAG.getBitcast(MVT::v8i16, Vec),
19196 DAG.getIntPtrConstant(WordIdx, dl));
19197 int ShiftVal = (IdxVal % 2) * 8;
19198 if (ShiftVal != 0)
19199 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
19200 DAG.getConstant(ShiftVal, dl, MVT::i8));
19201 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19202 }
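  // For illustration: extracting byte 3 from a v16i8 takes the DWordIdx == 0
  // path (3 / 4 == 0): extract the low i32, shift right by (3 % 4) * 8 = 24
  // bits, truncate to i8. Extracting byte 6 uses the word path instead:
  // WordIdx = 6 / 2 = 3 and ShiftVal = (6 % 2) * 8 = 0, so it is just an i16
  // extract of word 3 followed by a truncate.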
19203
19204 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
19205 if (IdxVal == 0)
19206 return Op;
19207
19208 // Shuffle the element to the lowest element, then movss or movsh.
19209 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
19210 Mask[0] = static_cast<int>(IdxVal);
19211 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19213 DAG.getIntPtrConstant(0, dl));
19214 }
19215
19216 if (VT.getSizeInBits() == 64) {
19217 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
19218 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
19219 // to match extract_elt for f64.
19220 if (IdxVal == 0)
19221 return Op;
19222
19223    // UNPCKHPD the element to the lowest double word, then movsd.
19224    // Note that if the lower 64 bits of the UNPCKHPD result are then stored
19225    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
19226 int Mask[2] = { 1, -1 };
19227 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
19228 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
19229 DAG.getIntPtrConstant(0, dl));
19230 }
19231
19232 return SDValue();
19233}
19234
19235/// Insert one bit into a mask vector, like v16i1 or v8i1.
19236/// AVX-512 feature.
19237static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
19238 const X86Subtarget &Subtarget) {
19239 SDLoc dl(Op);
19240 SDValue Vec = Op.getOperand(0);
19241 SDValue Elt = Op.getOperand(1);
19242 SDValue Idx = Op.getOperand(2);
19243 MVT VecVT = Vec.getSimpleValueType();
19244
19245 if (!isa<ConstantSDNode>(Idx)) {
19246    // Non-constant index. Extend the source and destination,
19247    // insert the element and then truncate the result.
19248 unsigned NumElts = VecVT.getVectorNumElements();
19249 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
19250 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
19251 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
19252 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
19253 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
19254 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
19255 }
19256
19257 // Copy into a k-register, extract to v1i1 and insert_subvector.
19258 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
19259 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
19260}
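// For illustration: with a non-constant index into a v8i1 mask, NumElts = 8,
// so ExtEltVT = MVT::getIntegerVT(128 / 8) = i16 and ExtVecVT = v8i16; the
// insert is performed on the sign-extended v8i16 value and the result is
// truncated back to v8i1. For masks wider than 8 elements the extended
// element type falls back to i8.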
19261
19262SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
19263 SelectionDAG &DAG) const {
19264 MVT VT = Op.getSimpleValueType();
19265 MVT EltVT = VT.getVectorElementType();
19266 unsigned NumElts = VT.getVectorNumElements();
19267 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
19268
19269 if (EltVT == MVT::i1)
19270 return InsertBitToMaskVector(Op, DAG, Subtarget);
19271
19272 SDLoc dl(Op);
19273 SDValue N0 = Op.getOperand(0);
19274 SDValue N1 = Op.getOperand(1);
19275 SDValue N2 = Op.getOperand(2);
19276 auto *N2C = dyn_cast<ConstantSDNode>(N2);
19277
19278 if (!N2C) {
19279    // With variable insertion indices we're usually better off spilling to the
19280    // stack, but AVX512 can use a variable compare+select by comparing against
19281    // all possible vector indices, and FP insertion has less gpr->simd traffic.
19282 if (!(Subtarget.hasBWI() ||
19283 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
19284 (Subtarget.hasSSE41() && VT.isFloatingPoint())))
19285 return SDValue();
19286
19287 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
19288 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
19289 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
19290 return SDValue();
19291
19292 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
19293 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
19294 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
19295
19296 SmallVector<SDValue, 16> RawIndices;
19297 for (unsigned I = 0; I != NumElts; ++I)
19298 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
19299 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
19300
19301 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
19302 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
19303 ISD::CondCode::SETEQ);
19304 }
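  // For illustration: for a variable insert into v4i32 the select above is
  //   IdxSplat = <N2, N2, N2, N2>, Indices = <0, 1, 2, 3>,
  //   EltSplat = <N1, N1, N1, N1>,
  //   result   = select(IdxSplat == Indices, EltSplat, N0)
  // so exactly the lane whose position equals N2 receives N1.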
19305
19306 if (N2C->getAPIntValue().uge(NumElts))
19307 return SDValue();
19308 uint64_t IdxVal = N2C->getZExtValue();
19309
19310 bool IsZeroElt = X86::isZeroNode(N1);
19311 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
19312
19313  // If we are inserting an element, see if we can do this more efficiently with
19314  // a blend shuffle against a rematerializable vector rather than with a costly
19315  // integer insertion.
19316 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
19317 (16 <= EltSizeInBits || (IsZeroElt && !VT.is128BitVector()))) {
19318 SmallVector<int, 8> BlendMask;
19319 for (unsigned i = 0; i != NumElts; ++i)
19320 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19321 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19322 : getOnesVector(VT, DAG, dl);
19323 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19324 }
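  // For illustration: inserting a zero element at index 2 of a v4i32 builds
  // BlendMask = {0, 1, 6, 3}; lane 2 comes from the rematerializable zero
  // vector (second shuffle operand, indices 4..7) and the other lanes from N0.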
19325
19326 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19327 // into that, and then insert the subvector back into the result.
19328 if (VT.is256BitVector() || VT.is512BitVector()) {
19329 // With a 256-bit vector, we can insert into the zero element efficiently
19330 // using a blend if we have AVX or AVX2 and the right data type.
19331 if (VT.is256BitVector() && IdxVal == 0) {
19332 // TODO: It is worthwhile to cast integer to floating point and back
19333 // and incur a domain crossing penalty if that's what we'll end up
19334 // doing anyway after extracting to a 128-bit vector.
19335 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19336 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
19337 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19338 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19339 DAG.getTargetConstant(1, dl, MVT::i8));
19340 }
19341 }
19342
19343 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19344    assert(isPowerOf2_32(NumEltsIn128) &&
19345           "Vectors will always have power-of-two number of elements.");
19346
19347 // If we are not inserting into the low 128-bit vector chunk,
19348 // then prefer the broadcast+blend sequence.
19349 // FIXME: relax the profitability check iff all N1 uses are insertions.
19350 if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
19351 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19352 (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
19353 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19354 SmallVector<int, 8> BlendMask;
19355 for (unsigned i = 0; i != NumElts; ++i)
19356 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19357 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19358 }
19359
19360 // Get the desired 128-bit vector chunk.
19361 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19362
19363 // Insert the element into the desired chunk.
19364 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19365 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19366
19367 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19368 DAG.getIntPtrConstant(IdxIn128, dl));
19369
19370 // Insert the changed part back into the bigger vector
19371 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19372 }
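  // For illustration: inserting into element 9 of a v16i16 has
  // NumEltsIn128 = 128 / 16 = 8, so the upper 128-bit chunk is extracted,
  // the element is inserted at IdxIn128 = 9 & 7 = 1 of that v8i16, and
  // insert128BitVector writes the chunk back into the upper half.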
19373  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19374
19375 // This will be just movw/movd/movq/movsh/movss/movsd.
19376 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19377 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19378 EltVT == MVT::f16 || EltVT == MVT::i64) {
19379 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19380 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19381 }
19382
19383 // We can't directly insert an i8 or i16 into a vector, so zero extend
19384 // it to i32 first.
19385 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19386 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19387 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19388 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19389 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19390 return DAG.getBitcast(VT, N1);
19391 }
19392 }
19393
19394  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19395  // argument. SSE41 is required for pinsrb.
19396 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19397 unsigned Opc;
19398 if (VT == MVT::v8i16) {
19399      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19400 Opc = X86ISD::PINSRW;
19401 } else {
19402      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19403      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19404 Opc = X86ISD::PINSRB;
19405 }
19406
19407    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19408 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19409 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19410 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19411 }
19412
19413 if (Subtarget.hasSSE41()) {
19414 if (EltVT == MVT::f32) {
19415 // Bits [7:6] of the constant are the source select. This will always be
19416 // zero here. The DAG Combiner may combine an extract_elt index into
19417 // these bits. For example (insert (extract, 3), 2) could be matched by
19418 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19419 // Bits [5:4] of the constant are the destination select. This is the
19420 // value of the incoming immediate.
19421 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19422 // combine either bitwise AND or insert of float 0.0 to set these bits.
19423
19424 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19425 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
19426 // If this is an insertion of 32-bits into the low 32-bits of
19427 // a vector, we prefer to generate a blend with immediate rather
19428 // than an insertps. Blends are simpler operations in hardware and so
19429 // will always have equal or better performance than insertps.
19430 // But if optimizing for size and there's a load folding opportunity,
19431 // generate insertps because blendps does not have a 32-bit memory
19432 // operand form.
19433 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19434 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19435 DAG.getTargetConstant(1, dl, MVT::i8));
19436 }
19437      // Create this as a scalar-to-vector.
19438 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19439 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19440 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19441 }
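    // For illustration: inserting N1 into lane 2 of a v4f32 uses the insertps
    // immediate IdxVal << 4 = 0x20: bits [5:4] = 2 select the destination
    // lane, bits [7:6] = 0 pick source element 0 of N1, and the zero mask in
    // bits [3:0] stays clear.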
19442
19443 // PINSR* works with constant index.
19444 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19445 return Op;
19446 }
19447
19448 return SDValue();
19449}
19450
19451static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19452 SelectionDAG &DAG) {
19453 SDLoc dl(Op);
19454 MVT OpVT = Op.getSimpleValueType();
19455
19456  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
19457  // further combines.
19458 if (X86::isZeroNode(Op.getOperand(0)))
19459 return getZeroVector(OpVT, Subtarget, DAG, dl);
19460
19461  // If this is a wider (256-bit or 512-bit) vector result, first insert into a
19462  // 128-bit vector and then insert that into the full-width vector.
19463 if (!OpVT.is128BitVector()) {
19464 // Insert into a 128-bit vector.
19465 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19466 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19467 OpVT.getVectorNumElements() / SizeFactor);
19468
19469 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19470
19471 // Insert the 128-bit vector.
19472 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19473 }
19474  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19475         "Expected an SSE type!");
19476
19477  // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19478  // tblgen.
19479 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19480 return Op;
19481
19482 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19483 return DAG.getBitcast(
19484 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19485}
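// For illustration: a SCALAR_TO_VECTOR of i16 into v8i16 without FP16 takes
// the final path above: the i16 is any-extended to i32, built as a v4i32
// SCALAR_TO_VECTOR, and the result is bitcast back to v8i16.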
19486
19487// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19488// simple superregister reference or explicit instructions to insert
19489// the upper bits of a vector.
19490static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19491 SelectionDAG &DAG) {
19492  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19493
19494 return insert1BitVector(Op, DAG, Subtarget);
19495}
19496
19497static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19498 SelectionDAG &DAG) {
19499  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19500         "Only vXi1 extract_subvectors need custom lowering");
19501
19502 SDLoc dl(Op);
19503 SDValue Vec = Op.getOperand(0);
19504 uint64_t IdxVal = Op.getConstantOperandVal(1);
19505
19506 if (IdxVal == 0) // the operation is legal
19507 return Op;
19508
19509 MVT VecVT = Vec.getSimpleValueType();
19510 unsigned NumElems = VecVT.getVectorNumElements();
19511
19512 // Extend to natively supported kshift.
19513 MVT WideVecVT = VecVT;
19514 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
19515 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19516 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
19517 DAG.getUNDEF(WideVecVT), Vec,
19518 DAG.getIntPtrConstant(0, dl));
19519 }
19520
19521 // Shift to the LSB.
19522 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
19523 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19524
19525 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19526 DAG.getIntPtrConstant(0, dl));
19527}
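// For illustration: extracting a v2i1 subvector at index 2 from a v8i1 on a
// target without DQI first widens the source to v16i1 (the natively supported
// kshift width there), shifts the mask right by 2 with KSHIFTR, and then
// takes the low v2i1 of the result.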
19528
19529// Returns the appropriate wrapper opcode for a global reference.
19530unsigned X86TargetLowering::getGlobalWrapperKind(
19531 const GlobalValue *GV, const unsigned char OpFlags) const {
19532 // References to absolute symbols are never PC-relative.
19533 if (GV && GV->isAbsoluteSymbolRef())
19534 return X86ISD::Wrapper;
19535
19536 CodeModel::Model M = getTargetMachine().getCodeModel();
19537 if (Subtarget.isPICStyleRIPRel() &&
19538 (M == CodeModel::Small || M == CodeModel::Kernel))
19539 return X86ISD::WrapperRIP;
19540
19541 // GOTPCREL references must always use RIP.
19542 if (OpFlags == X86II::MO_GOTPCREL)
19543 return X86ISD::WrapperRIP;
19544
19545 return X86ISD::Wrapper;
19546}
19547
19548// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19549// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19550// one of the above-mentioned nodes. It has to be wrapped because otherwise
19551// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19552// be used to form an addressing mode. These wrapped nodes will be selected
19553// into MOV32ri.
19554SDValue
19555X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19556 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19557
19558 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19559 // global base reg.
19560 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19561
19562 auto PtrVT = getPointerTy(DAG.getDataLayout());
19563 SDValue Result = DAG.getTargetConstantPool(
19564 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19565 SDLoc DL(CP);
19566 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19567 // With PIC, the address is actually $g + Offset.
19568 if (OpFlag) {
19569 Result =
19570 DAG.getNode(ISD::ADD, DL, PtrVT,
19571 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19572 }
19573
19574 return Result;
19575}
19576
19577SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19578 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19579
19580 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19581 // global base reg.
19582 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19583
19584 auto PtrVT = getPointerTy(DAG.getDataLayout());
19585 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19586 SDLoc DL(JT);
19587 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
19588
19589 // With PIC, the address is actually $g + Offset.
19590 if (OpFlag)
19591 Result =
19592 DAG.getNode(ISD::ADD, DL, PtrVT,
19593 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19594
19595 return Result;
19596}
19597
19598SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19599 SelectionDAG &DAG) const {
19600 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19601}
19602
19603SDValue
19604X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19605 // Create the TargetBlockAddressAddress node.
19606 unsigned char OpFlags =
19607 Subtarget.classifyBlockAddressReference();
19608 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19609 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19610 SDLoc dl(Op);
19611 auto PtrVT = getPointerTy(DAG.getDataLayout());
19612 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19613 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
19614
19615 // With PIC, the address is actually $g + Offset.
19616 if (isGlobalRelativeToPICBase(OpFlags)) {
19617 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19618 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19619 }
19620
19621 return Result;
19622}
19623
19624/// Creates target global address or external symbol nodes for calls or
19625/// other uses.
19626SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19627 bool ForCall) const {
19628 // Unpack the global address or external symbol.
19629 const SDLoc &dl = SDLoc(Op);
19630 const GlobalValue *GV = nullptr;
19631 int64_t Offset = 0;
19632 const char *ExternalSym = nullptr;
19633 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19634 GV = G->getGlobal();
19635 Offset = G->getOffset();
19636 } else {
19637 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19638 ExternalSym = ES->getSymbol();
19639 }
19640
19641 // Calculate some flags for address lowering.
19642 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19643 unsigned char OpFlags;
19644 if (ForCall)
19645 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19646 else
19647 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19648 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19649 bool NeedsLoad = isGlobalStubReference(OpFlags);
19650
19651 CodeModel::Model M = DAG.getTarget().getCodeModel();
19652 auto PtrVT = getPointerTy(DAG.getDataLayout());
19653 SDValue Result;
19654
19655 if (GV) {
19656 // Create a target global address if this is a global. If possible, fold the
19657 // offset into the global address reference. Otherwise, ADD it on later.
19658 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19659 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19660 // relocation will compute to a negative value, which is invalid.
19661 int64_t GlobalOffset = 0;
19662 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19663 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19664 std::swap(GlobalOffset, Offset);
19665 }
19666 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19667 } else {
19668 // If this is not a global address, this must be an external symbol.
19669 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19670 }
19671
19672 // If this is a direct call, avoid the wrapper if we don't need to do any
19673 // loads or adds. This allows SDAG ISel to match direct calls.
19674 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19675 return Result;
19676
19677 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19678
19679 // With PIC, the address is actually $g + Offset.
19680 if (HasPICReg) {
19681 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19682 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19683 }
19684
19685 // For globals that require a load from a stub to get the address, emit the
19686 // load.
19687 if (NeedsLoad)
19688 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19689 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19690
19691 // If there was a non-zero offset that we didn't fold, create an explicit
19692 // addition for it.
19693 if (Offset != 0)
19694 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19695 DAG.getConstant(Offset, dl, PtrVT));
19696
19697 return Result;
19698}
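// For illustration: a 32-bit PIC reference to a GOT-indirect global is both
// relative to the PIC base and a stub reference, so it lowers roughly to
//   load(add(GlobalBaseReg, Wrapper(TargetGlobalAddress))),
// with any unfolded offset added after the load; a simple direct call takes
// the early return and hands the bare TargetGlobalAddress to ISel.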
19699
19700SDValue
19701X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19702 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19703}
19704
19705static SDValue
19706GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
19707 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
19708 unsigned char OperandFlags, bool LocalDynamic = false) {
19709 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19710 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19711 SDLoc dl(GA);
19712 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19713 GA->getValueType(0),
19714 GA->getOffset(),
19715 OperandFlags);
19716
19717 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
19718 : X86ISD::TLSADDR;
19719
19720 if (InFlag) {
19721 SDValue Ops[] = { Chain, TGA, *InFlag };
19722 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19723 } else {
19724 SDValue Ops[] = { Chain, TGA };
19725 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
19726 }
19727
19728  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19729 MFI.setAdjustsStack(true);
19730 MFI.setHasCalls(true);
19731
19732 SDValue Flag = Chain.getValue(1);
19733 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
19734}
19735
19736// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19737static SDValue
19738LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19739 const EVT PtrVT) {
19740 SDValue InFlag;
19741 SDLoc dl(GA); // ? function entry point might be better
19742 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19743 DAG.getNode(X86ISD::GlobalBaseReg,
19744 SDLoc(), PtrVT), InFlag);
19745 InFlag = Chain.getValue(1);
19746
19747 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
19748}
19749
19750// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19751static SDValue
19752LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19753 const EVT PtrVT) {
19754 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19755 X86::RAX, X86II::MO_TLSGD);
19756}
19757
19758// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19759static SDValue
19760LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19761 const EVT PtrVT) {
19762 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
19763 X86::EAX, X86II::MO_TLSGD);
19764}
19765
19766static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19767 SelectionDAG &DAG, const EVT PtrVT,
19768 bool Is64Bit, bool Is64BitLP64) {
19769 SDLoc dl(GA);
19770
19771 // Get the start address of the TLS block for this module.
19772 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19773 .getInfo<X86MachineFunctionInfo>();
19774 MFI->incNumLocalDynamicTLSAccesses();
19775
19776 SDValue Base;
19777 if (Is64Bit) {
19778 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19779 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
19780 X86II::MO_TLSLD, /*LocalDynamic=*/true);
19781 } else {
19782 SDValue InFlag;
19783 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
19784 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
19785 InFlag = Chain.getValue(1);
19786 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
19787 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
19788 }
19789
19790 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19791 // of Base.
19792
19793 // Build x@dtpoff.
19794 unsigned char OperandFlags = X86II::MO_DTPOFF;
19795 unsigned WrapperKind = X86ISD::Wrapper;
19796 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19797 GA->getValueType(0),
19798 GA->getOffset(), OperandFlags);
19799 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19800
19801 // Add x@dtpoff with the base.
19802 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19803}
19804
19805// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19806static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19807 const EVT PtrVT, TLSModel::Model model,
19808 bool is64Bit, bool isPIC) {
19809 SDLoc dl(GA);
19810
19811 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19812 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
19813 is64Bit ? 257 : 256));
19814
19815 SDValue ThreadPointer =
19816 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19817 MachinePointerInfo(Ptr));
19818
19819 unsigned char OperandFlags = 0;
19820  // Most TLS accesses are not RIP-relative, even on x86-64. One exception is
19821  // initial exec.
19822 unsigned WrapperKind = X86ISD::Wrapper;
19823 if (model == TLSModel::LocalExec) {
19824 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19825 } else if (model == TLSModel::InitialExec) {
19826 if (is64Bit) {
19827 OperandFlags = X86II::MO_GOTTPOFF;
19828 WrapperKind = X86ISD::WrapperRIP;
19829 } else {
19830 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19831 }
19832 } else {
19833    llvm_unreachable("Unexpected model");
19834 }
19835
19836 // emit "addl x@ntpoff,%eax" (local exec)
19837 // or "addl x@indntpoff,%eax" (initial exec)
19838 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19839 SDValue TGA =
19840 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19841 GA->getOffset(), OperandFlags);
19842 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19843
19844 if (model == TLSModel::InitialExec) {
19845 if (isPIC && !is64Bit) {
19846 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19847 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19848 Offset);
19849 }
19850
19851 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19852 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19853 }
19854
19855 // The address of the thread local variable is the add of the thread
19856 // pointer with the offset of the variable.
19857 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19858}
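// For illustration: a 32-bit non-PIC local-exec access lowers to the
// "movl %gs:0, %eax; addl x@ntpoff, %eax" pattern from the comments above
// (thread-pointer load plus wrapped @ntpoff offset), while the initial-exec
// variants additionally load the offset from the GOT before the final add.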
19859
19860SDValue
19861X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19862
19863 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19864
19865 if (DAG.getTarget().useEmulatedTLS())
19866 return LowerToTLSEmulatedModel(GA, DAG);
19867
19868 const GlobalValue *GV = GA->getGlobal();
19869 auto PtrVT = getPointerTy(DAG.getDataLayout());
19870 bool PositionIndependent = isPositionIndependent();
19871
19872 if (Subtarget.isTargetELF()) {
19873 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19874 switch (model) {
19875 case TLSModel::GeneralDynamic:
19876 if (Subtarget.is64Bit()) {
19877 if (Subtarget.isTarget64BitLP64())
19878 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19879 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19880 }
19881 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19882 case TLSModel::LocalDynamic:
19883 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19884 Subtarget.isTarget64BitLP64());
19885 case TLSModel::InitialExec:
19886 case TLSModel::LocalExec:
19887 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19888 PositionIndependent);
19889 }
19890    llvm_unreachable("Unknown TLS model.");
19891 }
19892
19893 if (Subtarget.isTargetDarwin()) {
19894 // Darwin only has one model of TLS. Lower to that.
19895 unsigned char OpFlag = 0;
19896 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
19897 X86ISD::WrapperRIP : X86ISD::Wrapper;
19898
19899 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19900 // global base reg.
19901 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19902 if (PIC32)
19903 OpFlag = X86II::MO_TLVP_PIC_BASE;
19904 else
19905 OpFlag = X86II::MO_TLVP;
19906 SDLoc DL(Op);
19907 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19908 GA->getValueType(0),
19909 GA->getOffset(), OpFlag);
19910 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19911
19912 // With PIC32, the address is actually $g + Offset.
19913 if (PIC32)
19914 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19915 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19916 Offset);
19917
19918    // Lowering the machine ISD node will make sure everything is in the right
19919    // location.
19920 SDValue Chain = DAG.getEntryNode();
19921 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19922 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19923 SDValue Args[] = { Chain, Offset };
19924 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19925 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
19926 DAG.getIntPtrConstant(0, DL, true),
19927 Chain.getValue(1), DL);
19928
19929    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19930 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19931 MFI.setAdjustsStack(true);
19932
19933 // And our return value (tls address) is in the standard call return value
19934 // location.
19935 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19936 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19937 }
19938
19939 if (Subtarget.isOSWindows()) {
19940 // Just use the implicit TLS architecture
19941 // Need to generate something similar to:
19942 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19943 // ; from TEB
19944    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
19945 // mov rcx, qword [rdx+rcx*8]
19946 // mov eax, .tls$:tlsvar
19947 // [rax+rcx] contains the address
19948 // Windows 64bit: gs:0x58
19949 // Windows 32bit: fs:__tls_array
19950
19951 SDLoc dl(GA);
19952 SDValue Chain = DAG.getEntryNode();
19953
19954 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19955 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19956 // use its literal value of 0x2C.
19957 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
19958 ? Type::getInt8PtrTy(*DAG.getContext(),
19959 256)
19960 : Type::getInt32PtrTy(*DAG.getContext(),
19961 257));
19962
19963 SDValue TlsArray = Subtarget.is64Bit()
19964 ? DAG.getIntPtrConstant(0x58, dl)
19965 : (Subtarget.isTargetWindowsGNU()
19966 ? DAG.getIntPtrConstant(0x2C, dl)
19967 : DAG.getExternalSymbol("_tls_array", PtrVT));
19968
19969 SDValue ThreadPointer =
19970 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19971
19972 SDValue res;
19973 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19974 res = ThreadPointer;
19975 } else {
19976 // Load the _tls_index variable
19977 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19978 if (Subtarget.is64Bit())
19979 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19980 MachinePointerInfo(), MVT::i32);
19981 else
19982 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19983
19984 const DataLayout &DL = DAG.getDataLayout();
19985 SDValue Scale =
19986 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19987 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19988
19989 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19990 }
19991
19992 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19993
19994 // Get the offset of start of .tls section
19995 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19996 GA->getValueType(0),
19997 GA->getOffset(), X86II::MO_SECREL);
19998 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19999
20000 // The address of the thread local variable is the add of the thread
20001 // pointer with the offset of the variable.
20002 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
20003 }
20004
20005  llvm_unreachable("TLS not implemented for this target.");
20006}
20007
20008/// Lower SRA_PARTS and friends, which return two i32 values
20009/// and take a 2 x i32 value to shift plus a shift amount.
20010/// TODO: Can this be moved to general expansion code?
20011static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
20012 SDValue Lo, Hi;
20013 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
20014 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
20015}
20016
20017static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
20018 SelectionDAG &DAG) {
20019 MVT VT = Op.getSimpleValueType();
20020  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
20021         "Unexpected funnel shift opcode!");
20022
20023 SDLoc DL(Op);
20024 SDValue Op0 = Op.getOperand(0);
20025 SDValue Op1 = Op.getOperand(1);
20026 SDValue Amt = Op.getOperand(2);
20027
20028 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
20029
20030 if (VT.isVector()) {
20031    assert(Subtarget.hasVBMI2() && "Expected VBMI2");
20032
20033 if (IsFSHR)
20034 std::swap(Op0, Op1);
20035
20036 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20037 if (!Subtarget.hasVLX() && !VT.is512BitVector()) {
20038 Op0 = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
20039 Op1 = widenSubVector(Op1, false, Subtarget, DAG, DL, 512);
20040 }
20041
20042 SDValue Funnel;
20043 APInt APIntShiftAmt;
20044 MVT ResultVT = Op0.getSimpleValueType();
20045 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
20046 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
20047 Funnel =
20048 DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, ResultVT, Op0,
20049 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
20050 } else {
20051 if (!Subtarget.hasVLX() && !VT.is512BitVector())
20052 Amt = widenSubVector(Amt, false, Subtarget, DAG, DL, 512);
20053 Funnel = DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL,
20054 ResultVT, Op0, Op1, Amt);
20055 }
20056 if (!Subtarget.hasVLX() && !VT.is512BitVector())
20057 Funnel = extractSubVector(Funnel, 0, DAG, DL, VT.getSizeInBits());
20058 return Funnel;
20059 }
20060  assert(
20061      (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
20062      "Unexpected funnel shift type!");
20063
20064 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
20065 bool OptForSize = DAG.shouldOptForSize();
20066 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
20067
20068 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
20069 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
20070 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
20071 !isa<ConstantSDNode>(Amt)) {
20072 unsigned EltSizeInBits = VT.getScalarSizeInBits();
20073 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
20074 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
20075 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
20076 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
20077 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
20078 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
20079 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
20080 if (IsFSHR) {
20081 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
20082 } else {
20083 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
20084 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
20085 }
20086 return DAG.getZExtOrTrunc(Res, DL, VT);
20087 }
20088
20089 if (VT == MVT::i8 || ExpandFunnel)
20090 return SDValue();
20091
20092  // i16 needs an explicit modulo of the shift amount, but i32/i64 have an implicit modulo.
20093 if (VT == MVT::i16) {
20094 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
20095 DAG.getConstant(15, DL, Amt.getValueType()));
20096 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
20097 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
20098 }
20099
20100 return Op;
20101}
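// For illustration: the non-constant i8 expansion above evaluates
// fshl(i8 0xAB, i8 0xCD, 3) as ((0xAB << 8 | 0xCD) << 3) >> 8 = 0x55E68 >> 8,
// whose low byte 0x5E matches ((0xAB << 3) | (0xCD >> 5)) & 0xFF, the
// expected funnel-shift result.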
20102
20103// Try to use a packed vector operation to handle i64 on 32-bit targets when
20104// AVX512DQ is enabled.
20105static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
20106 const X86Subtarget &Subtarget) {
20107  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20108          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20109          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20110          Op.getOpcode() == ISD::UINT_TO_FP) &&
20111         "Unexpected opcode!");
20112 bool IsStrict = Op->isStrictFPOpcode();
20113 unsigned OpNo = IsStrict ? 1 : 0;
20114 SDValue Src = Op.getOperand(OpNo);
20115 MVT SrcVT = Src.getSimpleValueType();
20116 MVT VT = Op.getSimpleValueType();
20117
20118 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20119 (VT != MVT::f32 && VT != MVT::f64))
20120 return SDValue();
20121
20122 // Pack the i64 into a vector, do the operation and extract.
20123
20124  // Use 256 bits to ensure the result is 128 bits for the f32 case.
20125 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20126 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
20127 MVT VecVT = MVT::getVectorVT(VT, NumElts);
20128
20129 SDLoc dl(Op);
20130 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
20131 if (IsStrict) {
20132 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
20133 {Op.getOperand(0), InVec});
20134 SDValue Chain = CvtVec.getValue(1);
20135 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20136 DAG.getIntPtrConstant(0, dl));
20137 return DAG.getMergeValues({Value, Chain}, dl);
20138 }
20139
20140 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
20141
20142 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20143 DAG.getIntPtrConstant(0, dl));
20144}
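// For illustration: on a 32-bit target with AVX512DQ and VLX, an i64 -> f32
// sint_to_fp packs the scalar into lane 0 of a v4i64, converts with the
// packed v4i64 -> v4f32 instruction, and extracts lane 0; without VLX the
// same sequence is done at v8i64 -> v8f32 width.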
20145
20146// Try to use a packed vector operation to handle i64 on 32-bit targets.
20147static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
20148 const X86Subtarget &Subtarget) {
20149  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
20150          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
20151          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
20152          Op.getOpcode() == ISD::UINT_TO_FP) &&
20153         "Unexpected opcode!");
20154 bool IsStrict = Op->isStrictFPOpcode();
20155 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20156 MVT SrcVT = Src.getSimpleValueType();
20157 MVT VT = Op.getSimpleValueType();
20158
20159 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20160 return SDValue();
20161
20162 // Pack the i64 into a vector, do the operation and extract.
20163
20164  assert(Subtarget.hasFP16() && "Expected FP16");
20165
20166 SDLoc dl(Op);
20167 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
20168 if (IsStrict) {
20169 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20170 {Op.getOperand(0), InVec});
20171 SDValue Chain = CvtVec.getValue(1);
20172 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20173 DAG.getIntPtrConstant(0, dl));
20174 return DAG.getMergeValues({Value, Chain}, dl);
20175 }
20176
20177 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
20178
20179 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
20180 DAG.getIntPtrConstant(0, dl));
20181}
20182
20183static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
20184 const X86Subtarget &Subtarget) {
20185 switch (Opcode) {
20186 case ISD::SINT_TO_FP:
20187 // TODO: Handle wider types with AVX/AVX512.
20188 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
20189 return false;
20190 // CVTDQ2PS or (V)CVTDQ2PD
20191 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
20192
20193 case ISD::UINT_TO_FP:
20194 // TODO: Handle wider types and i64 elements.
20195 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
20196 return false;
20197 // VCVTUDQ2PS or VCVTUDQ2PD
20198 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
20199
20200 default:
20201 return false;
20202 }
20203}
20204
20205/// Given a scalar cast operation that is extracted from a vector, try to
20206/// vectorize the cast op followed by extraction. This will avoid an expensive
20207/// round-trip between XMM and GPR.
20208static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
20209 const X86Subtarget &Subtarget) {
20210 // TODO: This could be enhanced to handle smaller integer types by peeking
20211 // through an extend.
20212 SDValue Extract = Cast.getOperand(0);
20213 MVT DestVT = Cast.getSimpleValueType();
20214 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20215 !isa<ConstantSDNode>(Extract.getOperand(1)))
20216 return SDValue();
20217
20218 // See if we have a 128-bit vector cast op for this type of cast.
20219 SDValue VecOp = Extract.getOperand(0);
20220 MVT FromVT = VecOp.getSimpleValueType();
20221 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
20222 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
20223 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
20224 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
20225 return SDValue();
20226
20227 // If we are extracting from a non-zero element, first shuffle the source
20228 // vector to allow extracting from element zero.
20229 SDLoc DL(Cast);
20230 if (!isNullConstant(Extract.getOperand(1))) {
20231 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
20232 Mask[0] = Extract.getConstantOperandVal(1);
20233 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
20234 }
20235  // If the source vector is wider than 128 bits, extract the low part. Do not
20236  // create an unnecessarily wide vector cast op.
20237 if (FromVT != Vec128VT)
20238 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
20239
20240 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
20241 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
20242 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
20243 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
20244 DAG.getIntPtrConstant(0, DL));
20245}
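// For illustration: sint_to_fp to f32 of element 2 of a v4i32 source becomes
// a shuffle that moves element 2 into lane 0, a vector ISD::SINT_TO_FP to
// v4f32 (cvtdq2ps), and an extract of lane 0 of the result, avoiding a GPR
// round-trip through cvtsi2ss.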
20246
20247/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
20248/// try to vectorize the cast ops. This will avoid an expensive round-trip
20249/// between XMM and GPR.
20250static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
20251 const X86Subtarget &Subtarget) {
20252 // TODO: Allow FP_TO_UINT.
20253 SDValue CastToInt = CastToFP.getOperand(0);
20254 MVT VT = CastToFP.getSimpleValueType();
20255 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
20256 return SDValue();
20257
20258 MVT IntVT = CastToInt.getSimpleValueType();
20259 SDValue X = CastToInt.getOperand(0);
20260 MVT SrcVT = X.getSimpleValueType();
20261 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
20262 return SDValue();
20263
20264 // See if we have 128-bit vector cast instructions for this type of cast.
20265 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
20266 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
20267 IntVT != MVT::i32)
20268 return SDValue();
20269
20270 unsigned SrcSize = SrcVT.getSizeInBits();
20271 unsigned IntSize = IntVT.getSizeInBits();
20272 unsigned VTSize = VT.getSizeInBits();
20273 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
20274 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
20275 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
20276
20277 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
20278 unsigned ToIntOpcode =
20279 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
20280 unsigned ToFPOpcode =
20281 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
20282
20283 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
20284 //
20285 // We are not defining the high elements (for example, by zeroing them) because
20286 // that could nullify any performance advantage that we hoped to gain from
20287 // this vector op hack. We do not expect any adverse effects (like denorm
20288 // penalties) with cast ops.
20289 SDLoc DL(CastToFP);
20290 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
20291 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
20292 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
20293 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
20294 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
20295}
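
The scalar pattern being vectorized above is just a truncation through i32 followed by a conversion back to FP. A hedged scalar model (valid only for inputs whose truncated value fits in i32):

  #include <cstdint>

  // "Almost ftrunc": round toward zero via i32, then convert back to float.
  float fp_to_int_to_fp(float X) {
    return (float)(int32_t)X;
  }
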
20296
20297static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
20298 const X86Subtarget &Subtarget) {
20299 SDLoc DL(Op);
20300 bool IsStrict = Op->isStrictFPOpcode();
20301 MVT VT = Op->getSimpleValueType(0);
20302 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
20303
20304 if (Subtarget.hasDQI()) {
20305 assert(!Subtarget.hasVLX() && "Unexpected features");
20306
20307 assert((Src.getSimpleValueType() == MVT::v2i64 ||
20308 Src.getSimpleValueType() == MVT::v4i64) &&
20309 "Unsupported custom type");
20310
20311 // With AVX512DQ, but not VLX, we need to widen to get a 512-bit result type.
20312 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
20313 "Unexpected VT!");
20314 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20315
20316 // Need to concat with zero vector for strict fp to avoid spurious
20317 // exceptions.
20318 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
20319 : DAG.getUNDEF(MVT::v8i64);
20320 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
20321 DAG.getIntPtrConstant(0, DL));
20322 SDValue Res, Chain;
20323 if (IsStrict) {
20324 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
20325 {Op->getOperand(0), Src});
20326 Chain = Res.getValue(1);
20327 } else {
20328 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
20329 }
20330
20331 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20332 DAG.getIntPtrConstant(0, DL));
20333
20334 if (IsStrict)
20335 return DAG.getMergeValues({Res, Chain}, DL);
20336 return Res;
20337 }
20338
20339 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20340 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20341 if (VT != MVT::v4f32 || IsSigned)
20342 return SDValue();
20343
20344 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20345 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20346 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20347 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20348 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20349 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20350 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20351 SmallVector<SDValue, 4> SignCvts(4);
20352 SmallVector<SDValue, 4> Chains(4);
20353 for (int i = 0; i != 4; ++i) {
20354 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20355 DAG.getIntPtrConstant(i, DL));
20356 if (IsStrict) {
20357 SignCvts[i] =
20358 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20359 {Op.getOperand(0), Elt});
20360 Chains[i] = SignCvts[i].getValue(1);
20361 } else {
20362 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20363 }
20364 }
20365 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20366
20367 SDValue Slow, Chain;
20368 if (IsStrict) {
20369 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20370 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20371 {Chain, SignCvt, SignCvt});
20372 Chain = Slow.getValue(1);
20373 } else {
20374 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20375 }
20376
20377 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20378 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20379
20380 if (IsStrict)
20381 return DAG.getMergeValues({Cvt, Chain}, DL);
20382
20383 return Cvt;
20384}
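
A scalar model of the unsigned path above, the halve-with-sticky-bit trick used when only signed i64 converts are available (illustrative only; assumes round-to-nearest):

  #include <cstdint>

  float u64_to_f32_model(uint64_t X) {
    if ((int64_t)X >= 0)
      return (float)(int64_t)X;          // fits in a signed convert
    // Halve the value but keep bit 0 as a "sticky" bit so the final rounding
    // is still correct, convert signed, then double the result.
    uint64_t Halved = (X >> 1) | (X & 1);
    float F = (float)(int64_t)Halved;
    return F + F;
  }
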
20385
20386SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20387 SelectionDAG &DAG) const {
20388 bool IsStrict = Op->isStrictFPOpcode();
20389 unsigned OpNo = IsStrict ? 1 : 0;
20390 SDValue Src = Op.getOperand(OpNo);
20391 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20392 MVT SrcVT = Src.getSimpleValueType();
20393 MVT VT = Op.getSimpleValueType();
20394 SDLoc dl(Op);
20395
20396 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20397 return Extract;
20398
20399 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
20400 return R;
20401
20402 if (SrcVT.isVector()) {
20403 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20404 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20405 // source for strict FP.
20406 if (IsStrict)
20407 return DAG.getNode(
20408 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20409 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20410 DAG.getUNDEF(SrcVT))});
20411 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20412 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20413 DAG.getUNDEF(SrcVT)));
20414 }
20415 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20416 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20417
20418 return SDValue();
20419 }
20420
20421 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20422 "Unknown SINT_TO_FP to lower!");
20423
20424 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20425
20426 // These are really Legal; return the operand so the caller accepts it as
20427 // Legal.
20428 if (SrcVT == MVT::i32 && UseSSEReg)
20429 return Op;
20430 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20431 return Op;
20432
20433 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20434 return V;
20435 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20436 return V;
20437
20438 // SSE doesn't have an i16 conversion so we need to promote.
20439 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20440 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20441 if (IsStrict)
20442 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20443 {Chain, Ext});
20444
20445 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20446 }
20447
20448 if (VT == MVT::f128)
20449 return SDValue();
20450
20451 SDValue ValueToStore = Src;
20452 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20453 // Bitcasting to f64 here allows us to do a single 64-bit store from
20454 // an SSE register, avoiding the store forwarding penalty that would come
20455 // with two 32-bit stores.
20456 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20457
20458 unsigned Size = SrcVT.getStoreSize();
20459 Align Alignment(Size);
20460 MachineFunction &MF = DAG.getMachineFunction();
20461 auto PtrVT = getPointerTy(MF.getDataLayout());
20462 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20463 MachinePointerInfo MPI =
20464 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20465 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20466 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20467 std::pair<SDValue, SDValue> Tmp =
20468 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20469
20470 if (IsStrict)
20471 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20472
20473 return Tmp.first;
20474}
20475
20476std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20477 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20478 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20479 // Build the FILD
20480 SDVTList Tys;
20481 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20482 if (useSSE)
20483 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20484 else
20485 Tys = DAG.getVTList(DstVT, MVT::Other);
20486
20487 SDValue FILDOps[] = {Chain, Pointer};
20488 SDValue Result =
20489 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20490 Alignment, MachineMemOperand::MOLoad);
20491 Chain = Result.getValue(1);
20492
20493 if (useSSE) {
20494 MachineFunction &MF = DAG.getMachineFunction();
20495 unsigned SSFISize = DstVT.getStoreSize();
20496 int SSFI =
20497 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20498 auto PtrVT = getPointerTy(MF.getDataLayout());
20499 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20500 Tys = DAG.getVTList(MVT::Other);
20501 SDValue FSTOps[] = {Chain, Result, StackSlot};
20502 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
20503 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
20504 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20505
20506 Chain =
20507 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20508 Result = DAG.getLoad(
20509 DstVT, DL, Chain, StackSlot,
20510 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
20511 Chain = Result.getValue(1);
20512 }
20513
20514 return { Result, Chain };
20515}
20516
20517/// Horizontal vector math instructions may be slower than normal math with
20518/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20519/// implementation, and likely shuffle complexity of the alternate sequence.
20520static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20521 const X86Subtarget &Subtarget) {
20522 bool IsOptimizingSize = DAG.shouldOptForSize();
20523 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20524 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20525}
20526
20527/// 64-bit unsigned integer to double expansion.
20528static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
20529 const X86Subtarget &Subtarget) {
20530 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
20531 // when converting 0 with rounding toward negative infinity. The caller will
20532 // fall back to Expand (when i64 SINT_TO_FP is legal) or use FILD in 32-bit mode.
20533 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20534 // This algorithm is not obvious. Here is what we're trying to output:
20535 /*
20536 movq %rax, %xmm0
20537 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20538 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20539 #ifdef __SSE3__
20540 haddpd %xmm0, %xmm0
20541 #else
20542 pshufd $0x4e, %xmm0, %xmm1
20543 addpd %xmm1, %xmm0
20544 #endif
20545 */
20546
20547 SDLoc dl(Op);
20548 LLVMContext *Context = DAG.getContext();
20549
20550 // Build some magic constants.
20551 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20552 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20553 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20554 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20555
20556 SmallVector<Constant*,2> CV1;
20557 CV1.push_back(
20558 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20559 APInt(64, 0x4330000000000000ULL))));
20560 CV1.push_back(
20561 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20562 APInt(64, 0x4530000000000000ULL))));
20563 Constant *C1 = ConstantVector::get(CV1);
20564 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20565
20566 // Load the 64-bit value into an XMM register.
20567 SDValue XR1 =
20568 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20569 SDValue CLod0 = DAG.getLoad(
20570 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20571 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20572 SDValue Unpck1 =
20573 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20574
20575 SDValue CLod1 = DAG.getLoad(
20576 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20577 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20578 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20579 // TODO: Are there any fast-math-flags to propagate here?
20580 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20581 SDValue Result;
20582
20583 if (Subtarget.hasSSE3() &&
20584 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20585 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20586 } else {
20587 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20588 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20589 }
20590 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20591 DAG.getIntPtrConstant(0, dl));
20592 return Result;
20593}
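
A scalar model of the magic-constant sequence above, assuming little-endian IEEE-754 doubles (illustrative only): the two 32-bit halves are placed in the mantissas of 2^52 and 2^84, the biases are subtracted (the subpd step), and the halves are summed (the haddpd/addpd step).

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  double u64_to_f64_model(uint64_t X) {
    uint64_t LoBits = (0x43300000ULL << 32) | (uint32_t)X;         // 2^52 + lo32
    uint64_t HiBits = (0x45300000ULL << 32) | (uint32_t)(X >> 32); // 2^84 + hi32 * 2^32
    double Lo, Hi;
    std::memcpy(&Lo, &LoBits, sizeof(double));
    std::memcpy(&Hi, &HiBits, sizeof(double));
    // Subtract the biases, then add the halves; the final add rounds once.
    return (Lo - std::ldexp(1.0, 52)) + (Hi - std::ldexp(1.0, 84));
  }
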
20594
20595/// 32-bit unsigned integer to float expansion.
20596static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
20597 const X86Subtarget &Subtarget) {
20598 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20599 SDLoc dl(Op);
20600 // FP constant to bias correct the final result.
20601 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
20602 MVT::f64);
20603
20604 // Load the 32-bit value into an XMM register.
20605 SDValue Load =
20606 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20607
20608 // Zero out the upper parts of the register.
20609 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20610
20611 // Or the load with the bias.
20612 SDValue Or = DAG.getNode(
20613 ISD::OR, dl, MVT::v2i64,
20614 DAG.getBitcast(MVT::v2i64, Load),
20615 DAG.getBitcast(MVT::v2i64,
20616 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20617 Or =
20618 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20619 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
20620
20621 if (Op.getNode()->isStrictFPOpcode()) {
20622 // Subtract the bias.
20623 // TODO: Are there any fast-math-flags to propagate here?
20624 SDValue Chain = Op.getOperand(0);
20625 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20626 {Chain, Or, Bias});
20627
20628 if (Op.getValueType() == Sub.getValueType())
20629 return Sub;
20630
20631 // Handle final rounding.
20632 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20633 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20634
20635 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20636 }
20637
20638 // Subtract the bias.
20639 // TODO: Are there any fast-math-flags to propagate here?
20640 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20641
20642 // Handle final rounding.
20643 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20644}
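
The same bias idea in its 32-bit form, as a scalar sketch (little-endian IEEE-754 doubles assumed; illustrative only): OR the value into the mantissa of 2^52, reinterpret as a double, then subtract the bias.

  #include <cstdint>
  #include <cstring>

  double u32_to_f64_model(uint32_t X) {
    uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 + X, as raw bits
    double D;
    std::memcpy(&D, &Bits, sizeof(double));
    return D - 4503599627370496.0;             // subtract 2^52; result is exact
  }
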
20645
20646static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
20647 const X86Subtarget &Subtarget,
20648 const SDLoc &DL) {
20649 if (Op.getSimpleValueType() != MVT::v2f64)
20650 return SDValue();
20651
20652 bool IsStrict = Op->isStrictFPOpcode();
20653
20654 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20655 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20656
20657 if (Subtarget.hasAVX512()) {
20658 if (!Subtarget.hasVLX()) {
20659 // Let generic type legalization widen this.
20660 if (!IsStrict)
20661 return SDValue();
20662 // Otherwise pad the integer input with 0s and widen the operation.
20663 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20664 DAG.getConstant(0, DL, MVT::v2i32));
20665 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20666 {Op.getOperand(0), N0});
20667 SDValue Chain = Res.getValue(1);
20668 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20669 DAG.getIntPtrConstant(0, DL));
20670 return DAG.getMergeValues({Res, Chain}, DL);
20671 }
20672
20673 // Legalize to v4i32 type.
20674 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20675 DAG.getUNDEF(MVT::v2i32));
20676 if (IsStrict)
20677 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20678 {Op.getOperand(0), N0});
20679 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20680 }
20681
20682 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20683 // This gives us the floating point equivalent of 2^52 + the i32 integer
20684 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20685 // point leaving just our i32 integers in double format.
20686 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20687 SDValue VBias =
20688 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
20689 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20690 DAG.getBitcast(MVT::v2i64, VBias));
20691 Or = DAG.getBitcast(MVT::v2f64, Or);
20692
20693 if (IsStrict)
20694 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20695 {Op.getOperand(0), Or, VBias});
20696 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20697}
20698
20699static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
20700 const X86Subtarget &Subtarget) {
20701 SDLoc DL(Op);
20702 bool IsStrict = Op->isStrictFPOpcode();
20703 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20704 MVT VecIntVT = V.getSimpleValueType();
20705 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20706 "Unsupported custom type");
20707
20708 if (Subtarget.hasAVX512()) {
20709 // With AVX512, but not VLX, we need to widen to get a 512-bit result type.
20710 assert(!Subtarget.hasVLX() && "Unexpected features");
20711 MVT VT = Op->getSimpleValueType(0);
20712
20713 // v8i32->v8f64 is legal with AVX512 so just return it.
20714 if (VT == MVT::v8f64)
20715 return Op;
20716
20717 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20718 "Unexpected VT!");
20719 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20720 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20721 // Need to concat with zero vector for strict fp to avoid spurious
20722 // exceptions.
20723 SDValue Tmp =
20724 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20725 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20726 DAG.getIntPtrConstant(0, DL));
20727 SDValue Res, Chain;
20728 if (IsStrict) {
20729 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20730 {Op->getOperand(0), V});
20731 Chain = Res.getValue(1);
20732 } else {
20733 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20734 }
20735
20736 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20737 DAG.getIntPtrConstant(0, DL));
20738
20739 if (IsStrict)
20740 return DAG.getMergeValues({Res, Chain}, DL);
20741 return Res;
20742 }
20743
20744 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20745 Op->getSimpleValueType(0) == MVT::v4f64) {
20746 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20747 Constant *Bias = ConstantFP::get(
20748 *DAG.getContext(),
20749 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20750 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20751 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20752 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20753 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20754 SDValue VBias = DAG.getMemIntrinsicNode(
20755 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20756 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20757 MachineMemOperand::MOLoad);
20758
20759 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20760 DAG.getBitcast(MVT::v4i64, VBias));
20761 Or = DAG.getBitcast(MVT::v4f64, Or);
20762
20763 if (IsStrict)
20764 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20765 {Op.getOperand(0), Or, VBias});
20766 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20767 }
20768
20769 // The algorithm is the following:
20770 // #ifdef __SSE4_1__
20771 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20772 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20773 // (uint4) 0x53000000, 0xaa);
20774 // #else
20775 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20776 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20777 // #endif
20778 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20779 // return (float4) lo + fhi;
20780
20781 bool Is128 = VecIntVT == MVT::v4i32;
20782 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20783 // If we convert to something else than the supported type, e.g., to v4f64,
20784 // abort early.
20785 if (VecFloatVT != Op->getSimpleValueType(0))
20786 return SDValue();
20787
20788 // In the #ifdef/#else code, we have in common:
20789 // - The vector of constants:
20790 // -- 0x4b000000
20791 // -- 0x53000000
20792 // - A shift:
20793 // -- v >> 16
20794
20795 // Create the splat vector for 0x4b000000.
20796 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20797 // Create the splat vector for 0x53000000.
20798 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20799
20800 // Create the right shift.
20801 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20802 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20803
20804 SDValue Low, High;
20805 if (Subtarget.hasSSE41()) {
20806 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20807 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20808 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20809 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20810 // Low will be bitcasted right away, so do not bother bitcasting back to its
20811 // original type.
20812 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20813 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20814 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20815 // (uint4) 0x53000000, 0xaa);
20816 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20817 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20818 // High will be bitcasted right away, so do not bother bitcasting back to
20819 // its original type.
20820 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20821 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20822 } else {
20823 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20824 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20825 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20826 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20827
20828 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20829 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20830 }
20831
20832 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20833 SDValue VecCstFSub = DAG.getConstantFP(
20834 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20835
20836 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20837 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20838 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20839 // enabled. See PR24512.
20840 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20841 // TODO: Are there any fast-math-flags to propagate here?
20842 // (float4) lo;
20843 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20844 // return (float4) lo + fhi;
20845 if (IsStrict) {
20846 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20847 {Op.getOperand(0), HighBitcast, VecCstFSub});
20848 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20849 {FHigh.getValue(1), LowBitcast, FHigh});
20850 }
20851
20852 SDValue FHigh =
20853 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20854 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20855}
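
A scalar model of the lo/hi split described in the comment above (illustrative only; assumes IEEE-754 singles and round-to-nearest): 0x4b000000 is 2^23 and 0x53000000 is 2^39, so each 16-bit half lands in a mantissa, and the combined bias 0x1.0p39f + 0x1.0p23f (bit pattern 0x53000080) cancels in the final add.

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  float u32_to_f32_model(uint32_t V) {
    uint32_t LoBits = (V & 0xffffu) | 0x4b000000u; // 2^23 + lo16
    uint32_t HiBits = (V >> 16)     | 0x53000000u; // 2^39 + hi16 * 2^16
    float Lo, Hi;
    std::memcpy(&Lo, &LoBits, sizeof(float));
    std::memcpy(&Hi, &HiBits, sizeof(float));
    float FHi = Hi - (std::ldexp(1.0f, 39) + std::ldexp(1.0f, 23));
    return Lo + FHi;                               // the 2^23 in Lo cancels here
  }
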
20856
20857static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
20858 const X86Subtarget &Subtarget) {
20859 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20860 SDValue N0 = Op.getOperand(OpNo);
20861 MVT SrcVT = N0.getSimpleValueType();
20862 SDLoc dl(Op);
20863
20864 switch (SrcVT.SimpleTy) {
20865 default:
20866 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20867 case MVT::v2i32:
20868 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
20869 case MVT::v4i32:
20870 case MVT::v8i32:
20871 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
20872 case MVT::v2i64:
20873 case MVT::v4i64:
20874 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
20875 }
20876}
20877
20878SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20879 SelectionDAG &DAG) const {
20880 bool IsStrict = Op->isStrictFPOpcode();
20881 unsigned OpNo = IsStrict ? 1 : 0;
20882 SDValue Src = Op.getOperand(OpNo);
20883 SDLoc dl(Op);
20884 auto PtrVT = getPointerTy(DAG.getDataLayout());
20885 MVT SrcVT = Src.getSimpleValueType();
20886 MVT DstVT = Op->getSimpleValueType(0);
20887 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20888
20889 if (DstVT == MVT::f128)
20890 return SDValue();
20891
20892 if (DstVT.isVector())
20893 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
20894
20895 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
20896 return Extract;
20897
20898 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20899 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20900 // Conversions from unsigned i32 to f32/f64 are legal,
20901 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20902 return Op;
20903 }
20904
20905 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20906 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20907 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20908 if (IsStrict)
20909 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20910 {Chain, Src});
20911 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20912 }
20913
20914 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
20915 return V;
20916 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
20917 return V;
20918
20919 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20920 // infinity. It produces -0.0, so disable under strictfp.
20921 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64 && !IsStrict)
20922 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
20923 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
20924 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
20925 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20926 (DstVT == MVT::f32 || DstVT == MVT::f64))
20927 return SDValue();
20928
20929 // Make a 64-bit buffer, and use it to build an FILD.
20930 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20931 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20932 Align SlotAlign(8);
20933 MachinePointerInfo MPI =
20934 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20935 if (SrcVT == MVT::i32) {
20936 SDValue OffsetSlot =
20937 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
20938 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20939 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20940 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20941 std::pair<SDValue, SDValue> Tmp =
20942 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20943 if (IsStrict)
20944 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20945
20946 return Tmp.first;
20947 }
20948
20949 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20950 SDValue ValueToStore = Src;
20951 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20952 // Bitcasting to f64 here allows us to do a single 64-bit store from
20953 // an SSE register, avoiding the store forwarding penalty that would come
20954 // with two 32-bit stores.
20955 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20956 }
20957 SDValue Store =
20958 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20959 // For i64 source, we need to add the appropriate power of 2 if the input
20960 // was negative. We must be careful to do the computation in x87 extended
20961 // precision, not in SSE.
20962 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20963 SDValue Ops[] = { Store, StackSlot };
20964 SDValue Fild =
20965 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20966 SlotAlign, MachineMemOperand::MOLoad);
20967 Chain = Fild.getValue(1);
20968
20969
20970 // Check whether the sign bit is set.
20971 SDValue SignSet = DAG.getSetCC(
20972 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20973 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20974
20975 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20976 APInt FF(64, 0x5F80000000000000ULL);
20977 SDValue FudgePtr = DAG.getConstantPool(
20978 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20979 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20980
20981 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20982 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20983 SDValue Four = DAG.getIntPtrConstant(4, dl);
20984 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20985 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20986
20987 // Load the value out, extending it from f32 to f80.
20988 SDValue Fudge = DAG.getExtLoad(
20989 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20990 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20991 CPAlignment);
20992 Chain = Fudge.getValue(1);
20993 // Extend everything to 80 bits to force it to be done on x87.
20994 // TODO: Are there any fast-math-flags to propagate here?
20995 if (IsStrict) {
20996 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
20997 {Chain, Fild, Fudge});
20998 // STRICT_FP_ROUND can't handle equal types.
20999 if (DstVT == MVT::f80)
21000 return Add;
21001 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
21002 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
21003 }
21004 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
21005 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
21006 DAG.getIntPtrConstant(0, dl));
21007}
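
A hedged model of the FILD-plus-fudge path that ends this function (long double stands in for the x87 f80 temporary; 0x5F80000000000000 places 2^64 as an IEEE single in the high bits of the constant-pool pair): the i64 is loaded as if signed, and 2^64 is added back whenever the sign bit was set.

  #include <cstdint>

  long double u64_to_fp_fild_model(uint64_t X) {
    long double V = (long double)(int64_t)X; // FILD interprets memory as signed
    if ((int64_t)X < 0)
      V += 18446744073709551616.0L;          // add the 2^64 fudge factor
    return V;                                // caller then rounds to f32/f64
  }
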
21008
21009// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
21010// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
21011// just return an SDValue().
21012// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
21013// to i16, i32 or i64, and we lower it to a legal sequence and return the
21014// result.
21015SDValue
21016X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
21017 bool IsSigned, SDValue &Chain) const {
21018 bool IsStrict = Op->isStrictFPOpcode();
21019 SDLoc DL(Op);
21020
21021 EVT DstTy = Op.getValueType();
21022 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
21023 EVT TheVT = Value.getValueType();
21024 auto PtrVT = getPointerTy(DAG.getDataLayout());
21025
21026 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
21027 // f16 must be promoted before using the lowering in this routine.
21028 // fp128 does not use this lowering.
21029 return SDValue();
21030 }
21031
21032 // If using FIST to compute an unsigned i64, we'll need some fixup
21033 // to handle values above the maximum signed i64. A FIST is always
21034 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
21035 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
21036
21037 // FIXME: This does not generate an invalid exception if the input does not
21038 // fit in i32. PR44019
21039 if (!IsSigned && DstTy != MVT::i64) {
21040 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
21041 // The low 32 bits of the fist result will have the correct uint32 result.
21042 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
21043 DstTy = MVT::i64;
21044 }
21045
21046 assert(DstTy.getSimpleVT() <= MVT::i64 &&
21047 DstTy.getSimpleVT() >= MVT::i16 &&
21048 "Unknown FP_TO_INT to lower!");
21049
21050 // We lower FP->int64 into FISTP64 followed by a load from a temporary
21051 // stack slot.
21052 MachineFunction &MF = DAG.getMachineFunction();
21053 unsigned MemSize = DstTy.getStoreSize();
21054 int SSFI =
21055 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
21056 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21057
21058 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21059
21060 SDValue Adjust; // 0x0 or 0x8000000000000000, for result sign bit adjustment.
21061
21062 if (UnsignedFixup) {
21063 //
21064 // Conversion to unsigned i64 is implemented with a select,
21065 // depending on whether the source value fits in the range
21066 // of a signed i64. Let Thresh be the FP equivalent of
21067 // 0x8000000000000000ULL.
21068 //
21069 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
21070 // FltOfs = (Value >= Thresh) ? Thresh : 0;
21071 // FistSrc = (Value - FltOfs);
21072 // Fist-to-mem64 FistSrc
21073 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
21074 // to XOR'ing the result with Adjust.
21075 //
21076 // Being a power of 2, Thresh is exactly representable in all FP formats.
21077 // For X87 we'd like to use the smallest FP type for this constant, but
21078 // for DAG type consistency we have to match the FP operand type.
21079
21080 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
21081 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
21082 bool LosesInfo = false;
21083 if (TheVT == MVT::f64)
21084 // The rounding mode is irrelevant as the conversion should be exact.
21085 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
21086 &LosesInfo);
21087 else if (TheVT == MVT::f80)
21088 Status = Thresh.convert(APFloat::x87DoubleExtended(),
21089 APFloat::rmNearestTiesToEven, &LosesInfo);
21090
21091 assert(Status == APFloat::opOK && !LosesInfo &&
21092 "FP conversion should have been exact");
21093
21094 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
21095
21096 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
21097 *DAG.getContext(), TheVT);
21098 SDValue Cmp;
21099 if (IsStrict) {
21100 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
21101 /*IsSignaling*/ true);
21102 Chain = Cmp.getValue(1);
21103 } else {
21104 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
21105 }
21106
21107 // Our preferred lowering of
21108 //
21109 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
21110 //
21111 // is
21112 //
21113 // (Value >= Thresh) << 63
21114 //
21115 // but since we can get here after LegalOperations, DAGCombine might do the
21116 // wrong thing if we create a select. So, directly create the preferred
21117 // version.
21118 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
21119 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
21120 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
21121
21122 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
21123 DAG.getConstantFP(0.0, DL, TheVT));
21124
21125 if (IsStrict) {
21126 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
21127 { Chain, Value, FltOfs });
21128 Chain = Value.getValue(1);
21129 } else
21130 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
21131 }
21132
21133 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
21134
21135 // FIXME This causes a redundant load/store if the SSE-class value is already
21136 // in memory, such as if it is on the callstack.
21137 if (isScalarFPTypeInSSEReg(TheVT)) {
21138 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
21139 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
21140 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21141 SDValue Ops[] = { Chain, StackSlot };
21142
21143 unsigned FLDSize = TheVT.getStoreSize();
21144 assert(FLDSize <= MemSize && "Stack slot not big enough");
21145 MachineMemOperand *MMO = MF.getMachineMemOperand(
21146 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
21147 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
21148 Chain = Value.getValue(1);
21149 }
21150
21151 // Build the FP_TO_INT*_IN_MEM
21152 MachineMemOperand *MMO = MF.getMachineMemOperand(
21153 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
21154 SDValue Ops[] = { Chain, Value, StackSlot };
21155 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
21156 DAG.getVTList(MVT::Other),
21157 Ops, DstTy, MMO);
21158
21159 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
21160 Chain = Res.getValue(1);
21161
21162 // If we need an unsigned fixup, XOR the result with adjust.
21163 if (UnsignedFixup)
21164 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
21165
21166 return Res;
21167}
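
A scalar model of the unsigned-i64 fixup above (illustrative only; ignores strict-FP chaining and assumes the input is in range): values at or above 2^63 have the threshold subtracted before the signed FIST, and the sign bit is restored by XORing the result with Adjust.

  #include <cstdint>

  uint64_t f64_to_u64_model(double Value) {
    const double Thresh = 9223372036854775808.0;            // 2^63
    uint64_t Adjust = Value >= Thresh ? 0x8000000000000000ULL : 0;
    double FltOfs   = Value >= Thresh ? Thresh : 0.0;
    int64_t Fist = (int64_t)(Value - FltOfs);               // signed truncation
    return (uint64_t)Fist ^ Adjust;
  }
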
21168
21169static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
21170 const X86Subtarget &Subtarget) {
21171 MVT VT = Op.getSimpleValueType();
21172 SDValue In = Op.getOperand(0);
21173 MVT InVT = In.getSimpleValueType();
21174 SDLoc dl(Op);
21175 unsigned Opc = Op.getOpcode();
21176
21177 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
21178 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
21179 "Unexpected extension opcode");
21180 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21181 "Expected same number of elements");
21182 assert((VT.getVectorElementType() == MVT::i16 ||
21183 VT.getVectorElementType() == MVT::i32 ||
21184 VT.getVectorElementType() == MVT::i64) &&
21185 "Unexpected element type");
21186 assert((InVT.getVectorElementType() == MVT::i8 ||
21187 InVT.getVectorElementType() == MVT::i16 ||
21188 InVT.getVectorElementType() == MVT::i32) &&
21189 "Unexpected element type");
21190
21191 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
21192
21193 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
21194 assert(InVT == MVT::v32i8 && "Unexpected VT!");
21195 return splitVectorIntUnary(Op, DAG);
21196 }
21197
21198 if (Subtarget.hasInt256())
21199 return Op;
21200
21201 // Optimize vectors in AVX mode:
21202 //
21203 // v8i16 -> v8i32
21204 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
21205 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
21206 // Concat upper and lower parts.
21207 //
21208 // v4i32 -> v4i64
21209 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
21210 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
21211 // Concat upper and lower parts.
21212 //
21213 MVT HalfVT = VT.getHalfNumVectorElementsVT();
21214 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
21215
21216 // Short-circuit if we can determine that each 128-bit half is the same value.
21217 // Otherwise, this is difficult to match and optimize.
21218 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
21219 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
21220 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
21221
21222 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
21223 SDValue Undef = DAG.getUNDEF(InVT);
21224 bool NeedZero = Opc == ISD::ZERO_EXTEND;
21225 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
21226 OpHi = DAG.getBitcast(HalfVT, OpHi);
21227
21228 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
21229}
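
An intrinsics sketch of the AVX-only (no AVX2) split described in the comment above, shown for v8i16 -> v8i32 zero extension (illustrative only; the function name is made up): the low half uses vpmovzxwd, the high half uses vpunpckhwd against zero, and the halves are concatenated.

  #include <immintrin.h>

  __m256i zext_v8i16_to_v8i32(__m128i In) {
    __m128i Lo = _mm_cvtepu16_epi32(In);                      // vpmovzxwd
    __m128i Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128()); // vpunpckhwd vs zero
    return _mm256_set_m128i(Hi, Lo);                          // concat upper/lower
  }
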
21230
21231// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
21232static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
21233 const SDLoc &dl, SelectionDAG &DAG) {
21234 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
21235 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21236 DAG.getIntPtrConstant(0, dl));
21237 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
21238 DAG.getIntPtrConstant(8, dl));
21239 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
21240 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
21241 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
21242 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21243}
21244
21245static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
21246 const X86Subtarget &Subtarget,
21247 SelectionDAG &DAG) {
21248 MVT VT = Op->getSimpleValueType(0);
21249 SDValue In = Op->getOperand(0);
21250 MVT InVT = In.getSimpleValueType();
21251 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
21252 SDLoc DL(Op);
21253 unsigned NumElts = VT.getVectorNumElements();
21254
21255 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
21256 // avoids a constant pool load.
21257 if (VT.getVectorElementType() != MVT::i8) {
21258 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21259 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21260 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21261 }
21262
21263 // Extend VT if BWI is not supported.
21264 MVT ExtVT = VT;
21265 if (!Subtarget.hasBWI()) {
21266 // If v16i32 is to be avoided, we'll need to split and concatenate.
21267 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21268 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21269
21270 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21271 }
21272
21273 // Widen to 512-bits if VLX is not supported.
21274 MVT WideVT = ExtVT;
21275 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21276 NumElts *= 512 / ExtVT.getSizeInBits();
21277 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21278 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
21279 In, DAG.getIntPtrConstant(0, DL));
21280 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
21281 NumElts);
21282 }
21283
21284 SDValue One = DAG.getConstant(1, DL, WideVT);
21285 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21286
21287 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21288
21289 // Truncate if we had to extend above.
21290 if (VT != ExtVT) {
21291 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21292 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21293 }
21294
21295 // Extract back to 128/256-bit if we widened.
21296 if (WideVT != VT)
21297 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21298 DAG.getIntPtrConstant(0, DL));
21299
21300 return SelectedVal;
21301}
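// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the sign_extend + srl trick used above for non-vXi8 mask
// zero-extension: sign-extending an i1 yields 0 or all-ones, and a logical
// shift right by (bits - 1) turns that into 0 or 1 without loading a vector
// of ones from the constant pool.
#include <cstdint>
static uint32_t zextMaskBit(bool B) {
  int32_t Sext = B ? -1 : 0;        // ISD::SIGN_EXTEND of the i1 lane
  return uint32_t(Sext) >> 31;      // ISD::SRL by (ScalarSizeInBits - 1)
}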
21302
21303static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21304 SelectionDAG &DAG) {
21305 SDValue In = Op.getOperand(0);
21306 MVT SVT = In.getSimpleValueType();
21307
21308 if (SVT.getVectorElementType() == MVT::i1)
21309 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
21310
21311 assert(Subtarget.hasAVX() && "Expected AVX support");
21312 return LowerAVXExtend(Op, DAG, Subtarget);
21313}
21314
21315/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21316/// It makes use of the fact that vectors with enough leading sign/zero bits
21317/// prevent the PACKSS/PACKUS from saturating the results.
21318/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21319/// within each 128-bit lane.
21320static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21321 const SDLoc &DL, SelectionDAG &DAG,
21322 const X86Subtarget &Subtarget) {
21323 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21324 "Unexpected PACK opcode");
21325 assert(DstVT.isVector() && "VT not a vector?");
21326
21327 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21328 if (!Subtarget.hasSSE2())
21329 return SDValue();
21330
21331 EVT SrcVT = In.getValueType();
21332
21333 // No truncation required, we might get here due to recursive calls.
21334 if (SrcVT == DstVT)
21335 return In;
21336
21337 // We only support vector truncation to 64bits or greater from a
21338 // 128bits or greater source.
21339 unsigned DstSizeInBits = DstVT.getSizeInBits();
21340 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21341 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
21342 return SDValue();
21343
21344 unsigned NumElems = SrcVT.getVectorNumElements();
21345 if (!isPowerOf2_32(NumElems))
21346 return SDValue();
21347
21348 LLVMContext &Ctx = *DAG.getContext();
21349 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21350 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21351
21352 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21353
21354 // Pack to the largest type possible:
21355 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21356 EVT InVT = MVT::i16, OutVT = MVT::i8;
21357 if (SrcVT.getScalarSizeInBits() > 16 &&
21358 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21359 InVT = MVT::i32;
21360 OutVT = MVT::i16;
21361 }
21362
21363 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
21364 if (SrcVT.is128BitVector()) {
21365 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21366 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21367 In = DAG.getBitcast(InVT, In);
21368 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
21369 Res = extractSubVector(Res, 0, DAG, DL, 64);
21370 return DAG.getBitcast(DstVT, Res);
21371 }
21372
21373 // Split lower/upper subvectors.
21374 SDValue Lo, Hi;
21375 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21376
21377 unsigned SubSizeInBits = SrcSizeInBits / 2;
21378 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21379 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21380
21381 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21382 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21383 Lo = DAG.getBitcast(InVT, Lo);
21384 Hi = DAG.getBitcast(InVT, Hi);
21385 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21386 return DAG.getBitcast(DstVT, Res);
21387 }
21388
21389 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21390 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21391 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21392 Lo = DAG.getBitcast(InVT, Lo);
21393 Hi = DAG.getBitcast(InVT, Hi);
21394 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21395
21396 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21397 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21398 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21399 SmallVector<int, 64> Mask;
21400 int Scale = 64 / OutVT.getScalarSizeInBits();
21401 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21402 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21403
21404 if (DstVT.is256BitVector())
21405 return DAG.getBitcast(DstVT, Res);
21406
21407 // If 512bit -> 128bit truncate another stage.
21408 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21409 Res = DAG.getBitcast(PackedVT, Res);
21410 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21411 }
21412
21413 // Recursively pack lower/upper subvectors, concat result and pack again.
21414 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21415 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21416 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
21417 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
21418
21419 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21420 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21421 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21422}
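// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of why the PACK-based truncation above is safe: PACKSSWD /
// PACKSSDW saturate, so the lowering is only used when the input is known to
// have enough leading sign (or zero) bits that saturation can never fire, at
// which point the pack behaves exactly like a plain truncation.
#include <cstdint>
static int16_t packssLane(int32_t X) {
  if (X > INT16_MAX) return INT16_MAX;   // positive saturation
  if (X < INT16_MIN) return INT16_MIN;   // negative saturation
  return int16_t(X);                     // in-range: identical to truncation
}
// If X already sign-extends from 16 bits (ComputeNumSignBits(X) >= 17), both
// saturation branches are dead and packssLane(X) == int16_t(X).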
21423
21424static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
21425 const X86Subtarget &Subtarget) {
21426
21427 SDLoc DL(Op);
21428 MVT VT = Op.getSimpleValueType();
21429 SDValue In = Op.getOperand(0);
21430 MVT InVT = In.getSimpleValueType();
21431
21432 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21433
21434 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21435 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21436 if (InVT.getScalarSizeInBits() <= 16) {
21437 if (Subtarget.hasBWI()) {
21438 // legal, will go to VPMOVB2M, VPMOVW2M
21439 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21440 // We need to shift to get the lsb into sign position.
21441 // Shift packed bytes not supported natively, bitcast to word
21442 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21443 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21444 DAG.getBitcast(ExtVT, In),
21445 DAG.getConstant(ShiftInx, DL, ExtVT));
21446 In = DAG.getBitcast(InVT, In);
21447 }
21448 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21449 In, ISD::SETGT);
21450 }
21451 // Use TESTD/Q, extended vector to packed dword/qword.
21452 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21453 "Unexpected vector type.");
21454 unsigned NumElts = InVT.getVectorNumElements();
21455 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21456 // We need to change to a wider element type that we have support for.
21457 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21458 // For 16 element vectors we extend to v16i32 unless we are explicitly
21459 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21460 // we need to split into two 8 element vectors which we can extend to v8i32,
21461 // truncate and concat the results. There's an additional complication if
21462 // the original type is v16i8. In that case we can't split the v16i8
21463 // directly, so we need to shuffle high elements to low and use
21464 // sign_extend_vector_inreg.
21465 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21466 SDValue Lo, Hi;
21467 if (InVT == MVT::v16i8) {
21468 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21469 Hi = DAG.getVectorShuffle(
21470 InVT, DL, In, In,
21471 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21472 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21473 } else {
21474 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21475 Lo = extract128BitVector(In, 0, DAG, DL);
21476 Hi = extract128BitVector(In, 8, DAG, DL);
21477 }
21478 // We're split now, just emit two truncates and a concat. The two
21479 // truncates will trigger legalization to come back to this function.
21480 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21481 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21482 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21483 }
21484 // We either have 8 elements or we're allowed to use 512-bit vectors.
21485 // If we have VLX, we want to use the narrowest vector that can get the
21486 // job done so we use vXi32.
21487 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21488 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21489 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21490 InVT = ExtVT;
21491 ShiftInx = InVT.getScalarSizeInBits() - 1;
21492 }
21493
21494 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21495 // We need to shift to get the lsb into sign position.
21496 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21497 DAG.getConstant(ShiftInx, DL, InVT));
21498 }
21499 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21500 if (Subtarget.hasDQI())
21501 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21502 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21503}
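// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the "shift LSB to MSB" step above: truncation to i1 keeps
// only the lowest bit of each lane, so the lowering shifts that bit into the
// sign position and then does a sign test (vpmovw2m / vptestmd style).
#include <cstdint>
static bool truncLaneToI1(uint16_t Lane) {
  int16_t Shifted = int16_t(Lane << 15);   // ISD::SHL by (ScalarSizeInBits - 1),
                                           // models the vector lane wrap-around
  return Shifted < 0;                      // sign test == original LSB
}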
21504
21505SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21506 SDLoc DL(Op);
21507 MVT VT = Op.getSimpleValueType();
21508 SDValue In = Op.getOperand(0);
21509 MVT InVT = In.getSimpleValueType();
21510 unsigned InNumEltBits = InVT.getScalarSizeInBits();
21511
21512 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21513 "Invalid TRUNCATE operation");
21514
21515 // If we're called by the type legalizer, handle a few cases.
21516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21517 if (!TLI.isTypeLegal(InVT)) {
21518 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21519 VT.is128BitVector()) {
21520 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21521 "Unexpected subtarget!");
21522 // The default behavior is to truncate one step, concatenate, and then
21523 // truncate the remainder. We'd rather produce two 64-bit results and
21524 // concatenate those.
21525 SDValue Lo, Hi;
21526 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21527
21528 EVT LoVT, HiVT;
21529 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21530
21531 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21532 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21533 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21534 }
21535
21536 // Otherwise let default legalization handle it.
21537 return SDValue();
21538 }
21539
21540 if (VT.getVectorElementType() == MVT::i1)
21541 return LowerTruncateVecI1(Op, DAG, Subtarget);
21542
21543 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21544 if (Subtarget.hasAVX512()) {
21545 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21546 assert(VT == MVT::v32i8 && "Unexpected VT!");
21547 return splitVectorIntUnary(Op, DAG);
21548 }
21549
21550 // word to byte only under BWI. Otherwise we have to promote to v16i32
21551 // and then truncate that. But we should only do that if we haven't been
21552 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21553 // handled by isel patterns.
21554 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21555 Subtarget.canExtendTo512DQ())
21556 return Op;
21557 }
21558
21559 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
21560 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21561
21562 // Truncate with PACKUS if we are truncating a vector with leading zero bits
21563 // that extend all the way to the packed/truncated value.
21564 // Pre-SSE41 we can only use PACKUSWB.
21565 KnownBits Known = DAG.computeKnownBits(In);
21566 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
21567 if (SDValue V =
21568 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
21569 return V;
21570
21571 // Truncate with PACKSS if we are truncating a vector with sign-bits that
21572 // extend all the way to the packed/truncated value.
21573 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
21574 if (SDValue V =
21575 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
21576 return V;
21577
21578 // Handle truncation of V256 to V128 using shuffles.
21579 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21580
21581 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21582 In = DAG.getBitcast(MVT::v8i32, In);
21583
21584 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21585 if (Subtarget.hasInt256()) {
21586 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21587 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21588 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21589 DAG.getIntPtrConstant(0, DL));
21590 }
21591
21592 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21593 DAG.getIntPtrConstant(0, DL));
21594 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
21595 DAG.getIntPtrConstant(4, DL));
21596 static const int ShufMask[] = {0, 2, 4, 6};
21597 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
21598 }
21599
21600 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21601 In = DAG.getBitcast(MVT::v32i8, In);
21602
21603 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21604 if (Subtarget.hasInt256()) {
21605 // The PSHUFB mask:
21606 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21607 -1, -1, -1, -1, -1, -1, -1, -1,
21608 16, 17, 20, 21, 24, 25, 28, 29,
21609 -1, -1, -1, -1, -1, -1, -1, -1 };
21610 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21611 In = DAG.getBitcast(MVT::v4i64, In);
21612
21613 static const int ShufMask2[] = {0, 2, -1, -1};
21614 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21615 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16,
21616 DAG.getBitcast(MVT::v16i16, In),
21617 DAG.getIntPtrConstant(0, DL));
21618 }
21619
21620 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21621 DAG.getIntPtrConstant(0, DL));
21622 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, In,
21623 DAG.getIntPtrConstant(16, DL));
21624
21625 // The PSHUFB mask:
21626 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
21627 -1, -1, -1, -1, -1, -1, -1, -1};
21628
21629 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
21630 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
21631
21632 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
21633 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
21634
21635 // The MOVLHPS Mask:
21636 static const int ShufMask2[] = {0, 1, 4, 5};
21637 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
21638 return DAG.getBitcast(MVT::v8i16, res);
21639 }
21640
21641 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
21642 // Use an AND to zero upper bits for PACKUS.
21643 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
21644
21645 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21646 DAG.getIntPtrConstant(0, DL));
21647 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
21648 DAG.getIntPtrConstant(8, DL));
21649 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
21650 }
21651
21652 llvm_unreachable("All 256->128 cases should have been handled above!");
21653}
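// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the v16i16 -> v16i8 tail case above: the AND with 255
// guarantees every word already fits in [0, 255], so the saturating PACKUSWB
// degenerates into a plain per-lane truncation.
#include <cstdint>
static uint8_t packusLane(uint16_t X) {
  uint16_t Masked = X & 0xFF;                   // the ISD::AND emitted above
  return Masked > 255 ? 255 : uint8_t(Masked);  // PACKUSWB saturation (never taken)
}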
21654
21655// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21656// behaves on out of range inputs to generate optimized conversions.
21657static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21658 SelectionDAG &DAG,
21659 const X86Subtarget &Subtarget) {
21660 MVT SrcVT = Src.getSimpleValueType();
21661 unsigned DstBits = VT.getScalarSizeInBits();
21662 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21663
21664 // Calculate the converted result for values in the range 0 to
21665 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21666 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21667 SDValue Big =
21668 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21669 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21670 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21671
21672 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21673 // and only if the value was out of range. So we can use that
21674 // as our indicator that we'd rather use "Big" instead of "Small".
21675 //
21676 // Use "Small" if "IsOverflown" has all bits cleared
21677 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21678
21679 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21680 // use the slightly slower blendv select instead.
21681 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21682 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21683 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21684 }
21685
21686 SDValue IsOverflown =
21687 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21688 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21689 return DAG.getNode(ISD::OR, dl, VT, Small,
21690 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21691}
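// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the cvttps2dq trick used above (and again for the scalar
// cvttss2si/cvttsd2si case in LowerFP_TO_INT below): out-of-range inputs
// convert to the "integer indefinite" value 0x80000000, so the sign bit of the
// small conversion doubles as the overflow mask. cvttModel is a hypothetical
// stand-in that avoids the C++ UB of an out-of-range float-to-int cast.
#include <cstdint>
static int32_t cvttModel(float X) {
  if (!(X >= -2147483648.0f && X < 2147483648.0f))
    return INT32_MIN;                             // hardware returns 0x80000000
  return int32_t(X);                              // in-range truncation
}
static uint32_t fpToUint32SSE(float X) {
  int32_t Small = cvttModel(X);                   // CVTTP2SI on Src
  int32_t Big = cvttModel(X - 2147483648.0f);     // CVTTP2SI on Src - 2^31
  int32_t IsOverflown = Small >> 31;              // VSRAI: all-ones iff out of range
  return uint32_t(Small | (Big & IsOverflown));   // Small, or 0x80000000 | Big
}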
21692
21693SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21694 bool IsStrict = Op->isStrictFPOpcode();
21695 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21696 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21697 MVT VT = Op->getSimpleValueType(0);
21698 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21699 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21700 MVT SrcVT = Src.getSimpleValueType();
21701 SDLoc dl(Op);
21702
21703 SDValue Res;
21704 if (VT.isVector()) {
21705 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21706 MVT ResVT = MVT::v4i32;
21707 MVT TruncVT = MVT::v4i1;
21708 unsigned Opc;
21709 if (IsStrict)
21710 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21711 else
21712 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21713
21714 if (!IsSigned && !Subtarget.hasVLX()) {
21715 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21716 // Widen to 512-bits.
21717 ResVT = MVT::v8i32;
21718 TruncVT = MVT::v8i1;
21719 Opc = Op.getOpcode();
21720 // Need to concat with zero vector for strict fp to avoid spurious
21721 // exceptions.
21722 // TODO: Should we just do this for non-strict as well?
21723 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21724 : DAG.getUNDEF(MVT::v8f64);
21725 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21726 DAG.getIntPtrConstant(0, dl));
21727 }
21728 if (IsStrict) {
21729 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21730 Chain = Res.getValue(1);
21731 } else {
21732 Res = DAG.getNode(Opc, dl, ResVT, Src);
21733 }
21734
21735 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21736 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21737 DAG.getIntPtrConstant(0, dl));
21738 if (IsStrict)
21739 return DAG.getMergeValues({Res, Chain}, dl);
21740 return Res;
21741 }
21742
21743 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21744 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21745 return Op;
21746
21747 MVT ResVT = VT;
21748 MVT EleVT = VT.getVectorElementType();
21749 if (EleVT != MVT::i64)
21750 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21751
21752 if (SrcVT != MVT::v8f16) {
21753 SDValue Tmp =
21754 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21755 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21756 Ops[0] = Src;
21757 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21758 }
21759
21760 if (IsStrict) {
21761 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21762 : X86ISD::STRICT_CVTTP2UI,
21763 dl, {ResVT, MVT::Other}, {Chain, Src});
21764 Chain = Res.getValue(1);
21765 } else {
21766 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21767 ResVT, Src);
21768 }
21769
21770 // TODO: Need to add exception check code for strict FP.
21771 if (EleVT.getSizeInBits() < 16) {
21772 ResVT = MVT::getVectorVT(EleVT, 8);
21773 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21774 }
21775
21776 if (ResVT != VT)
21777 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21778 DAG.getIntPtrConstant(0, dl));
21779
21780 if (IsStrict)
21781 return DAG.getMergeValues({Res, Chain}, dl);
21782 return Res;
21783 }
21784
21785 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
21786 if (IsStrict) {
21787 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21788 : ISD::STRICT_FP_TO_UINT,
21789 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
21790 Chain = Res.getValue(1);
21791 } else {
21792 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21793 MVT::v8i32, Src);
21794 }
21795
21796 // TODO: Need to add exception check code for strict FP.
21797 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
21798
21799 if (IsStrict)
21800 return DAG.getMergeValues({Res, Chain}, dl);
21801 return Res;
21802 }
21803
21804 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21805 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21806 assert(!IsSigned && "Expected unsigned conversion!");
21807 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21808 return Op;
21809 }
21810
21811 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21812 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21813 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21814 Subtarget.useAVX512Regs()) {
21815 assert(!IsSigned && "Expected unsigned conversion!");
21816 assert(!Subtarget.hasVLX() && "Unexpected features!");
21817 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21818 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21819 // Need to concat with zero vector for strict fp to avoid spurious
21820 // exceptions.
21821 // TODO: Should we just do this for non-strict as well?
21822 SDValue Tmp =
21823 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21824 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21825 DAG.getIntPtrConstant(0, dl));
21826
21827 if (IsStrict) {
21828 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21829 {Chain, Src});
21830 Chain = Res.getValue(1);
21831 } else {
21832 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21833 }
21834
21835 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21836 DAG.getIntPtrConstant(0, dl));
21837
21838 if (IsStrict)
21839 return DAG.getMergeValues({Res, Chain}, dl);
21840 return Res;
21841 }
21842
21843 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21844 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21845 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21846 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21847 assert(!Subtarget.hasVLX() && "Unexpected features!");
21848 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21849 // Need to concat with zero vector for strict fp to avoid spurious
21850 // exceptions.
21851 // TODO: Should we just do this for non-strict as well?
21852 SDValue Tmp =
21853 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21854 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21855 DAG.getIntPtrConstant(0, dl));
21856
21857 if (IsStrict) {
21858 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21859 {Chain, Src});
21860 Chain = Res.getValue(1);
21861 } else {
21862 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21863 }
21864
21865 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21866 DAG.getIntPtrConstant(0, dl));
21867
21868 if (IsStrict)
21869 return DAG.getMergeValues({Res, Chain}, dl);
21870 return Res;
21871 }
21872
21873 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21874 if (!Subtarget.hasVLX()) {
21875 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21876 // legalizer and then widened again by vector op legalization.
21877 if (!IsStrict)
21878 return SDValue();
21879
21880 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21881 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21882 {Src, Zero, Zero, Zero});
21883 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21884 {Chain, Tmp});
21885 SDValue Chain = Tmp.getValue(1);
21886 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21887 DAG.getIntPtrConstant(0, dl));
21888 return DAG.getMergeValues({Tmp, Chain}, dl);
21889 }
21890
21891 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21892 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21893 DAG.getUNDEF(MVT::v2f32));
21894 if (IsStrict) {
21895 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21896 : X86ISD::STRICT_CVTTP2UI;
21897 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21898 }
21899 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21900 return DAG.getNode(Opc, dl, VT, Tmp);
21901 }
21902
21903 // Generate optimized instructions for pre AVX512 unsigned conversions from
21904 // vXf32 to vXi32.
21905 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21906 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21907 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21908 assert(!IsSigned && "Expected unsigned conversion!");
21909 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21910 }
21911
21912 return SDValue();
21913 }
21914
21915 assert(!VT.isVector());
21916
21917 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21918
21919 if (!IsSigned && UseSSEReg) {
21920 // Conversions from f32/f64 with AVX512 should be legal.
21921 if (Subtarget.hasAVX512())
21922 return Op;
21923
21924 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21925 // behaves on out of range inputs to generate optimized conversions.
21926 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21927 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21928 unsigned DstBits = VT.getScalarSizeInBits();
21929 APInt UIntLimit = APInt::getSignMask(DstBits);
21930 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21931 DAG.getConstant(UIntLimit, dl, VT));
21932 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21933
21934 // Calculate the converted result for values in the range:
21935 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21936 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21937 SDValue Small =
21938 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21939 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21940 SDValue Big = DAG.getNode(
21941 X86ISD::CVTTS2SI, dl, VT,
21942 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21943 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21944
21945 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21946 // and only if the value was out of range. So we can use that
21947 // as our indicator that we'd rather use "Big" instead of "Small".
21948 //
21949 // Use "Small" if "IsOverflown" has all bits cleared
21950 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21951 SDValue IsOverflown = DAG.getNode(
21952 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21953 return DAG.getNode(ISD::OR, dl, VT, Small,
21954 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21955 }
21956
21957 // Use default expansion for i64.
21958 if (VT == MVT::i64)
21959 return SDValue();
21960
21961 assert(VT == MVT::i32 && "Unexpected VT!");
21962
21963 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21964 // FIXME: This does not generate an invalid exception if the input does not
21965 // fit in i32. PR44019
21966 if (Subtarget.is64Bit()) {
21967 if (IsStrict) {
21968 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21969 {Chain, Src});
21970 Chain = Res.getValue(1);
21971 } else
21972 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21973
21974 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21975 if (IsStrict)
21976 return DAG.getMergeValues({Res, Chain}, dl);
21977 return Res;
21978 }
21979
21980 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21981 // use fisttp which will be handled later.
21982 if (!Subtarget.hasSSE3())
21983 return SDValue();
21984 }
21985
21986 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21987 // FIXME: This does not generate an invalid exception if the input does not
21988 // fit in i16. PR44019
21989 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21990 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21991 if (IsStrict) {
21992 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21993 {Chain, Src});
21994 Chain = Res.getValue(1);
21995 } else
21996 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21997
21998 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21999 if (IsStrict)
22000 return DAG.getMergeValues({Res, Chain}, dl);
22001 return Res;
22002 }
22003
22004 // If this is a FP_TO_SINT using SSEReg we're done.
22005 if (UseSSEReg && IsSigned)
22006 return Op;
22007
22008 // fp128 needs to use a libcall.
22009 if (SrcVT == MVT::f128) {
22010 RTLIB::Libcall LC;
22011 if (IsSigned)
22012 LC = RTLIB::getFPTOSINT(SrcVT, VT);
22013 else
22014 LC = RTLIB::getFPTOUINT(SrcVT, VT);
22015
22016 MakeLibCallOptions CallOptions;
22017 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
22018 SDLoc(Op), Chain);
22019
22020 if (IsStrict)
22021 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
22022
22023 return Tmp.first;
22024 }
22025
22026 // Fall back to X87.
22027 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
22028 if (IsStrict)
22029 return DAG.getMergeValues({V, Chain}, dl);
22030 return V;
22031 }
22032
22033 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
22034}
22035
22036SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
22037 SelectionDAG &DAG) const {
22038 SDValue Src = Op.getOperand(0);
22039 MVT SrcVT = Src.getSimpleValueType();
22040
22041 // If the source is in an SSE register, the node is Legal.
22042 if (isScalarFPTypeInSSEReg(SrcVT))
22043 return Op;
22044
22045 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22046}
22047
22048SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22049 SelectionDAG &DAG) const {
22050 EVT DstVT = N->getValueType(0);
22051 SDValue Src = N->getOperand(0);
22052 EVT SrcVT = Src.getValueType();
22053
22054 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22055 // f16 must be promoted before using the lowering in this routine.
22056 // fp128 does not use this lowering.
22057 return SDValue();
22058 }
22059
22060 SDLoc DL(N);
22061 SDValue Chain = DAG.getEntryNode();
22062
22063 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22064
22065 // If we're converting from SSE, the stack slot needs to hold both types.
22066 // Otherwise it only needs to hold the DstVT.
22067 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22068 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22069 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22070 MachinePointerInfo MPI =
22071 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22072
22073 if (UseSSE) {
22074 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22075 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22076 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22077 SDValue Ops[] = { Chain, StackPtr };
22078
22079 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22080 /*Align*/ None, MachineMemOperand::MOLoad);
22081 Chain = Src.getValue(1);
22082 }
22083
22084 SDValue StoreOps[] = { Chain, Src, StackPtr };
22085 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22086 StoreOps, DstVT, MPI, /*Align*/ None,
22087 MachineMemOperand::MOStore);
22088
22089 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22090}
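// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// The LRINT/LLRINT lowering above round-trips through a stack slot: store the
// SSE value, reload it into x87 with FLD, convert with FIST (which honours the
// current rounding mode), and load the integer back. A rough scalar stand-in
// for the observable result, assuming the default FP environment:
#include <cmath>
static long long llrintModel(double X) {
  return std::llrint(X);   // rounds using the current rounding mode, like FIST
}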
22091
22092SDValue
22093X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22094 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22095 // but making use of X86 specifics to produce better instruction sequences.
22096 SDNode *Node = Op.getNode();
22097 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22098 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22099 SDLoc dl(SDValue(Node, 0));
22100 SDValue Src = Node->getOperand(0);
22101
22102 // There are three types involved here: SrcVT is the source floating point
22103 // type, DstVT is the type of the result, and TmpVT is the result of the
22104 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22105 // DstVT).
22106 EVT SrcVT = Src.getValueType();
22107 EVT DstVT = Node->getValueType(0);
22108 EVT TmpVT = DstVT;
22109
22110 // This code is only for floats and doubles. Fall back to generic code for
22111 // anything else.
22112 if (!isScalarFPTypeInSSEReg(SrcVT))
22113 return SDValue();
22114
22115 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22116 unsigned SatWidth = SatVT.getScalarSizeInBits();
22117 unsigned DstWidth = DstVT.getScalarSizeInBits();
22118 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22119 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22120 "Expected saturation width smaller than result width");
22121
22122 // Promote result of FP_TO_*INT to at least 32 bits.
22123 if (TmpWidth < 32) {
22124 TmpVT = MVT::i32;
22125 TmpWidth = 32;
22126 }
22127
22128 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22129 // us to use a native signed conversion instead.
22130 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22131 TmpVT = MVT::i64;
22132 TmpWidth = 64;
22133 }
22134
22135 // If the saturation width is smaller than the size of the temporary result,
22136 // we can always use signed conversion, which is native.
22137 if (SatWidth < TmpWidth)
22138 FpToIntOpcode = ISD::FP_TO_SINT;
22139
22140 // Determine minimum and maximum integer values and their corresponding
22141 // floating-point values.
22142 APInt MinInt, MaxInt;
22143 if (IsSigned) {
22144 MinInt = APInt::getSignedMinValue(SatWidth).sextOrSelf(DstWidth);
22145 MaxInt = APInt::getSignedMaxValue(SatWidth).sextOrSelf(DstWidth);
22146 } else {
22147 MinInt = APInt::getMinValue(SatWidth).zextOrSelf(DstWidth);
22148 MaxInt = APInt::getMaxValue(SatWidth).zextOrSelf(DstWidth);
22149 }
22150
22151 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22152 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
22153
22154 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22155 MinInt, IsSigned, APFloat::rmTowardZero);
22156 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22157 MaxInt, IsSigned, APFloat::rmTowardZero);
22158 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22159 && !(MaxStatus & APFloat::opStatus::opInexact);
22160
22161 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22162 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22163
22164 // If the integer bounds are exactly representable as floats, emit a
22165 // min+max+fptoi sequence. Otherwise use comparisons and selects.
22166 if (AreExactFloatBounds) {
22167 if (DstVT != TmpVT) {
22168 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22169 SDValue MinClamped = DAG.getNode(
22170 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22171 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22172 SDValue BothClamped = DAG.getNode(
22173 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22174 // Convert clamped value to integer.
22175 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22176
22177 // NaN will become INDVAL, with the top bit set and the rest zero.
22178 // Truncation will discard the top bit, resulting in zero.
22179 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22180 }
22181
22182 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22183 SDValue MinClamped = DAG.getNode(
22184 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22185 // Clamp by MaxFloat from above. NaN cannot occur.
22186 SDValue BothClamped = DAG.getNode(
22187 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22188 // Convert clamped value to integer.
22189 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22190
22191 if (!IsSigned) {
22192 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22193 // which is zero.
22194 return FpToInt;
22195 }
22196
22197 // Otherwise, select zero if Src is NaN.
22198 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22199 return DAG.getSelectCC(
22200 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22201 }
22202
22203 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22204 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22205
22206 // Result of direct conversion, which may be selected away.
22207 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22208
22209 if (DstVT != TmpVT) {
22210 // NaN will become INDVAL, with the top bit set and the rest zero.
22211 // Truncation will discard the top bit, resulting in zero.
22212 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22213 }
22214
22215 SDValue Select = FpToInt;
22216 // For signed conversions where we saturate to the same size as the
22217 // result type of the fptoi instructions, INDVAL coincides with integer
22218 // minimum, so we don't need to explicitly check it.
22219 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22220 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22221 // MinInt if Src is NaN.
22222 Select = DAG.getSelectCC(
22223 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22224 }
22225
22226 // If Src OGT MaxFloat, select MaxInt.
22227 Select = DAG.getSelectCC(
22228 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22229
22230 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22231 // is already zero. The promoted case was already handled above.
22232 if (!IsSigned || DstVT != TmpVT) {
22233 return Select;
22234 }
22235
22236 // Otherwise, select 0 if Src is NaN.
22237 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22238 return DAG.getSelectCC(
22239 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22240}
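// [Editor's note: illustrative sketch, not part of X86ISelLowering.cpp.]
// Scalar model of the exact-bounds branch of LowerFP_TO_INT_SAT above, for a
// case where both integer bounds are exactly representable (f64 -> i32): clamp
// in the FP domain first (FMAX/FMIN), convert with the native signed
// instruction, and map NaN to zero via the SETUO select.
#include <cmath>
#include <cstdint>
static int32_t fpToSint32Sat(double X) {
  if (std::isnan(X))
    return 0;                                   // SETUO select of ZeroInt
  const double MinF = -2147483648.0;            // INT32_MIN, exact as f64
  const double MaxF = 2147483647.0;             // INT32_MAX, exact as f64
  double Clamped = X < MinF ? MinF : (X > MaxF ? MaxF : X);  // FMAX then FMIN
  return int32_t(Clamped);                      // native cvttsd2si
}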
22241
22242SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22243 bool IsStrict = Op->isStrictFPOpcode();
22244
22245 SDLoc DL(Op);
22246 MVT VT = Op.getSimpleValueType();
22247 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22248 MVT SVT = In.getSimpleValueType();
22249
22250 if (VT == MVT::f128)
22251 return SDValue();
22252
22253 if (VT == MVT::f80) {
22254 if (SVT == MVT::f16) {
22255 assert(Subtarget.hasFP16() && "Unexpected features!");
22256 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
22257 MakeLibCallOptions CallOptions;
22258 std::pair<SDValue, SDValue> Tmp =
22259 makeLibCall(DAG, LC, VT, In, CallOptions, DL,
22260 IsStrict ? Op.getOperand(0) : SDValue());
22261 if (IsStrict)
22262 return DAG.getMergeValues({Tmp.first, Tmp.second}, DL);
22263 else
22264 return Tmp.first;
22265 }
22266 return Op;
22267 }
22268
22269 if (SVT.getVectorElementType() == MVT::f16) {
22270 assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
22271 if (SVT == MVT::v2f16)
22272 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22273 DAG.getUNDEF(MVT::v2f16));
22274 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22275 DAG.getUNDEF(MVT::v4f16));
22276 if (IsStrict)
22277 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22278 {Op->getOperand(0), Res});
22279 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22280 }
22281
22282 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22283
22284 SDValue Res =
22285 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22286 if (IsStrict)
22287 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22288 {Op->getOperand(0), Res});
22289 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22290}
22291
22292SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22293 bool IsStrict = Op->isStrictFPOpcode();
22294 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22295 MVT VT = Op.getSimpleValueType();
22296 MVT SVT = In.getSimpleValueType();
22297
22298 // It's legal except when f128 is involved or we're converting f80->f16.
22299 if (SVT != MVT::f128 && !(VT == MVT::f16 && SVT == MVT::f80))
22300 return Op;
22301
22302 return SDValue();
22303}
22304
22305static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22306 bool IsStrict = Op->isStrictFPOpcode();
22307 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22308 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22309 "Unexpected VT!");
22310
22311 SDLoc dl(Op);
22312 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22313 DAG.getConstant(0, dl, MVT::v8i16), Src,
22314 DAG.getIntPtrConstant(0, dl));
22315
22316 SDValue Chain;
22317 if (IsStrict) {
22318 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22319 {Op.getOperand(0), Res});
22320 Chain = Res.getValue(1);
22321 } else {
22322 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22323 }
22324
22325 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22326 DAG.getIntPtrConstant(0, dl));
22327
22328 if (IsStrict)
22329 return DAG.getMergeValues({Res, Chain}, dl);
22330
22331 return Res;
22332}
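// Illustrative sketch (not part of this file): the same half->float lowering
// written with F16C intrinsics, assuming <immintrin.h> and an F16C-capable
// target; the helper name is made up for the example. The 16-bit payload goes
// into lane 0, CVTPH2PS converts, and lane 0 of the float result is extracted,
// mirroring the node sequence above.
#include <immintrin.h>
static float fp16BitsToFp32(unsigned short H) {
  __m128i V = _mm_cvtsi32_si128(H); // place the i16 payload in lane 0
  __m128 F = _mm_cvtph_ps(V);       // CVTPH2PS converts the low 4 half floats
  return _mm_cvtss_f32(F);          // extract element 0
}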
22333
22334static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22335 bool IsStrict = Op->isStrictFPOpcode();
22336 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22337 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22338 "Unexpected VT!");
22339
22340 SDLoc dl(Op);
22341 SDValue Res, Chain;
22342 if (IsStrict) {
22343 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22344 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22345 DAG.getIntPtrConstant(0, dl));
22346 Res = DAG.getNode(
22347 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22348 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22349 Chain = Res.getValue(1);
22350 } else {
22351 // FIXME: Should we use zeros for upper elements for non-strict?
22352 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22353 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22354 DAG.getTargetConstant(4, dl, MVT::i32));
22355 }
22356
22357 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22358 DAG.getIntPtrConstant(0, dl));
22359
22360 if (IsStrict)
22361 return DAG.getMergeValues({Res, Chain}, dl);
22362
22363 return Res;
22364}
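// Illustrative sketch (not part of this file): the float->half path with F16C
// intrinsics; the immediate 4 passed above is _MM_FROUND_CUR_DIRECTION. The
// helper name is made up for the example.
#include <immintrin.h>
static unsigned short fp32ToFp16Bits(float F) {
  __m128 V = _mm_set_ss(F);                              // scalar into lane 0
  __m128i H = _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION); // CVTPS2PH, imm = 4
  return (unsigned short)_mm_extract_epi16(H, 0);        // extract element 0
}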
22365
22366/// Depending on uarch and/or optimizing for size, we might prefer to use a
22367/// vector operation in place of the typical scalar operation.
22368static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
22369 const X86Subtarget &Subtarget) {
22370 // If both operands have other uses, this is probably not profitable.
22371 SDValue LHS = Op.getOperand(0);
22372 SDValue RHS = Op.getOperand(1);
22373 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22374 return Op;
22375
22376 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22377 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22378 if (IsFP && !Subtarget.hasSSE3())
22379 return Op;
22380 if (!IsFP && !Subtarget.hasSSSE3())
22381 return Op;
22382
22383 // Extract from a common vector.
22384 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22385 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22386 LHS.getOperand(0) != RHS.getOperand(0) ||
22387 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22388 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22389 !shouldUseHorizontalOp(true, DAG, Subtarget))
22390 return Op;
22391
22392 // Allow commuted 'hadd' ops.
22393 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22394 unsigned HOpcode;
22395 switch (Op.getOpcode()) {
22396 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22397 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22398 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22399 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22400 default:
22401 llvm_unreachable("Trying to lower unsupported opcode to horizontal op")::llvm::llvm_unreachable_internal("Trying to lower unsupported opcode to horizontal op"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22401)
;
22402 }
22403 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22404 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22405 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22406 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22407 std::swap(LExtIndex, RExtIndex);
22408
22409 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22410 return Op;
22411
22412 SDValue X = LHS.getOperand(0);
22413 EVT VecVT = X.getValueType();
22414 unsigned BitWidth = VecVT.getSizeInBits();
22415 unsigned NumLanes = BitWidth / 128;
22416 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22417 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22418 "Not expecting illegal vector widths here");
22419
22420 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22421 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22422 SDLoc DL(Op);
22423 if (BitWidth == 256 || BitWidth == 512) {
22424 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22425 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22426 LExtIndex %= NumEltsPerLane;
22427 }
22428
22429 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22430 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22431 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22432 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22433 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22434 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22435 DAG.getIntPtrConstant(LExtIndex / 2, DL));
22436}
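// Illustrative sketch (not part of this file): the shape of this rewrite for a
// v4f32 source X, written with SSE3 intrinsics. Adding two adjacent extracted
// lanes becomes lane 0 (or lane LExtIndex/2) of a horizontal add.
#include <pmmintrin.h>
static float addAdjacentLanes(__m128 X) {
  // Scalar form:  X[0] + X[1]   Lowered form:  extractelt (haddps X, X), 0
  __m128 H = _mm_hadd_ps(X, X); // lanes: {X0+X1, X2+X3, X0+X1, X2+X3}
  return _mm_cvtss_f32(H);
}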
22437
22438/// Depending on uarch and/or optimizing for size, we might prefer to use a
22439/// vector operation in place of the typical scalar operation.
22440SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22441 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22442 "Only expecting float/double");
22443 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
22444}
22445
22446/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22447/// This mode isn't supported in hardware on X86. But as long as we aren't
22448/// compiling with trapping math, we can emulate this with
22449/// floor(X + copysign(nextafter(0.5, 0.0), X)).
22450static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22451 SDValue N0 = Op.getOperand(0);
22452 SDLoc dl(Op);
22453 MVT VT = Op.getSimpleValueType();
22454
22455 // N0 += copysign(nextafter(0.5, 0.0), N0)
22456 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22457 bool Ignored;
22458 APFloat Point5Pred = APFloat(0.5f);
22459 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22460 Point5Pred.next(/*nextDown*/true);
22461
22462 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22463 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22464 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22465
22466 // Truncate the result to remove fraction.
22467 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22468}
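// Illustrative sketch (not part of this file): the scalar emulation built by
// the node sequence above, written with <cmath>. Adding the largest value
// below 0.5 (with the sign of X) and then truncating rounds to nearest with
// ties away from zero.
#include <cmath>
static double roundTiesAway(double X) {
  double PredHalf = std::nextafter(0.5, 0.0);        // nextafter(0.5, 0.0)
  return std::trunc(X + std::copysign(PredHalf, X)); // FADD then FTRUNC
}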
22469
22470/// The only differences between FABS and FNEG are the mask and the logic op.
22471/// FNEG also has a folding opportunity for FNEG(FABS(x)).
22472static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22473 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22474 "Wrong opcode for lowering FABS or FNEG.");
22475
22476 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22477
22478 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22479 // into an FNABS. We'll lower the FABS after that if it is still in use.
22480 if (IsFABS)
22481 for (SDNode *User : Op->uses())
22482 if (User->getOpcode() == ISD::FNEG)
22483 return Op;
22484
22485 SDLoc dl(Op);
22486 MVT VT = Op.getSimpleValueType();
22487
22488 bool IsF128 = (VT == MVT::f128);
22489 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22490 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22491 "Unexpected type in LowerFABSorFNEG");
22492
22493 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
22494 // decide if we should generate a 16-byte constant mask when we only need 4 or
22495 // 8 bytes for the scalar case.
22496
22497 // There are no scalar bitwise logical SSE/AVX instructions, so we
22498 // generate a 16-byte vector constant and logic op even for the scalar case.
22499 // Using a 16-byte mask allows folding the load of the mask with
22500 // the logic op, so it can save (~4 bytes) on code size.
22501 bool IsFakeVector = !VT.isVector() && !IsF128;
22502 MVT LogicVT = VT;
22503 if (IsFakeVector)
22504 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22505 : (VT == MVT::f32) ? MVT::v4f32
22506 : MVT::v8f16;
22507
22508 unsigned EltBits = VT.getScalarSizeInBits();
22509 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22510 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22511 APInt::getSignMask(EltBits);
22512 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22513 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22514
22515 SDValue Op0 = Op.getOperand(0);
22516 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22517 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22518 IsFNABS ? X86ISD::FOR :
22519 X86ISD::FXOR;
22520 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22521
22522 if (VT.isVector() || IsF128)
22523 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22524
22525 // For the scalar case extend to a 128-bit vector, perform the logic op,
22526 // and extract the scalar result back out.
22527 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22528 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22529 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22530 DAG.getIntPtrConstant(0, dl));
22531}
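// Illustrative sketch (not part of this file): the bit-level effect of the
// masks chosen above. FABS ANDs the sign bit away (0x7f... mask) and FNEG
// XORs it (0x80... mask); FNABS = FNEG(FABS(x)) ORs the sign bit in.
#include <cstdint>
#include <cstring>
static float fabsViaMask(float X) {
  std::uint32_t U;
  std::memcpy(&U, &X, sizeof U);
  U &= 0x7fffffffu;              // FAND with APInt::getSignedMaxValue
  std::memcpy(&X, &U, sizeof X);
  return X;
}
static float fnegViaMask(float X) {
  std::uint32_t U;
  std::memcpy(&U, &X, sizeof U);
  U ^= 0x80000000u;              // FXOR with APInt::getSignMask
  std::memcpy(&X, &U, sizeof X);
  return X;
}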
22532
22533static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22534 SDValue Mag = Op.getOperand(0);
22535 SDValue Sign = Op.getOperand(1);
22536 SDLoc dl(Op);
22537
22538 // If the sign operand is smaller, extend it first.
22539 MVT VT = Op.getSimpleValueType();
22540 if (Sign.getSimpleValueType().bitsLT(VT))
22541 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22542
22543 // And if it is bigger, shrink it first.
22544 if (Sign.getSimpleValueType().bitsGT(VT))
22545 Sign =
22546 DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(0, dl));
22547
22548 // At this point the operands and the result should have the same
22549 // type, and that won't be f80 since that is not custom lowered.
22550 bool IsF128 = (VT == MVT::f128);
22551 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22552 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
22553 "Unexpected type in LowerFCOPYSIGN");
22554
22555 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
22556
22557 // Perform all scalar logic operations as 16-byte vectors because there are no
22558 // scalar FP logic instructions in SSE.
22559 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22560 // unnecessary splats, but we might miss load folding opportunities. Should
22561 // this decision be based on OptimizeForSize?
22562 bool IsFakeVector = !VT.isVector() && !IsF128;
22563 MVT LogicVT = VT;
22564 if (IsFakeVector)
22565 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22566 : (VT == MVT::f32) ? MVT::v4f32
22567 : MVT::v8f16;
22568
22569 // The mask constants are automatically splatted for vector types.
22570 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22571 SDValue SignMask = DAG.getConstantFP(
22572 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22573 SDValue MagMask = DAG.getConstantFP(
22574 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22575
22576 // First, clear all bits but the sign bit from the second operand (sign).
22577 if (IsFakeVector)
22578 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22579 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22580
22581 // Next, clear the sign bit from the first operand (magnitude).
22582 // TODO: If we had general constant folding for FP logic ops, this check
22583 // wouldn't be necessary.
22584 SDValue MagBits;
22585 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22586 APFloat APF = Op0CN->getValueAPF();
22587 APF.clearSign();
22588 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22589 } else {
22590 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22591 if (IsFakeVector)
22592 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22593 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22594 }
22595
22596 // OR the magnitude value with the sign bit.
22597 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22598 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22599 DAG.getIntPtrConstant(0, dl));
22600}
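// Illustrative sketch (not part of this file): the FAND/FAND/FOR dataflow
// above on scalar bit patterns -- keep the magnitude bits of Mag, keep only
// the sign bit of Sign, then OR the two together.
#include <cstdint>
#include <cstring>
static float copysignViaMasks(float Mag, float Sign) {
  std::uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof M);
  std::memcpy(&S, &Sign, sizeof S);
  std::uint32_t R = (M & 0x7fffffffu)  // MagBits = Mag & MagMask
                  | (S & 0x80000000u); // SignBit = Sign & SignMask, then FOR
  float Out;
  std::memcpy(&Out, &R, sizeof Out);
  return Out;
}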
22601
22602static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22603 SDValue N0 = Op.getOperand(0);
22604 SDLoc dl(Op);
22605 MVT VT = Op.getSimpleValueType();
22606
22607 MVT OpVT = N0.getSimpleValueType();
22608 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22609 "Unexpected type for FGETSIGN");
22610
22611 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22612 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22613 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22614 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22615 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22616 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22617 return Res;
22618}
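// Illustrative sketch (not part of this file): MOVMSKPS packs the per-lane
// sign bits into an integer, so bit 0 of the mask is the sign of the scalar
// placed in lane 0 -- exactly the AND(MOVMSK, 1) sequence above.
#include <xmmintrin.h>
static int fgetsignF32(float X) {
  return _mm_movemask_ps(_mm_set_ss(X)) & 1; // MOVMSKPS + AND 1
}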
22619
22620static SDValue lowerISNAN(SDValue Op, SelectionDAG &DAG) {
22621 SDLoc DL(Op);
22622 SDValue Arg = Op.getOperand(0);
22623 MVT ArgVT = Arg.getSimpleValueType();
22624 MVT ResultVT = Op.getSimpleValueType();
22625
22626 // If exceptions are ignored, use unordered comparison for fp80. It recognizes
22627 // unsupported values as NaNs.
22628 if (ArgVT == MVT::f80 && Op->getFlags().hasNoFPExcept())
22629 return DAG.getSetCC(DL, ResultVT, Arg, Arg, ISD::CondCode::SETUNE);
22630
22631 // Determine classification of argument using instruction FXAM.
22632 unsigned Opc;
22633 switch (ArgVT.SimpleTy) {
22634 default:
22635 llvm_unreachable("Unexpected type!")::llvm::llvm_unreachable_internal("Unexpected type!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22635)
;
22636 case MVT::f32:
22637 Opc = X86::XAM_Fp32;
22638 break;
22639 case MVT::f64:
22640 Opc = X86::XAM_Fp64;
22641 break;
22642 case MVT::f80:
22643 Opc = X86::XAM_Fp80;
22644 break;
22645 }
22646 SDValue Test(DAG.getMachineNode(Opc, DL, MVT::Glue, Arg), 0);
22647
22648 // Move FPSW to AX.
22649 SDValue FNSTSW =
22650 SDValue(DAG.getMachineNode(X86::FNSTSW16r, DL, MVT::i16, Test), 0);
22651
22652 // Extract upper 8-bits of AX.
22653 SDValue Extract =
22654 DAG.getTargetExtractSubreg(X86::sub_8bit_hi, DL, MVT::i8, FNSTSW);
22655
22656 // Mask all bits but C3, C2, C0.
22657 Extract = DAG.getNode(ISD::AND, DL, MVT::i8, Extract,
22658 DAG.getConstant(0x45, DL, MVT::i8));
22659
22660 return DAG.getSetCC(DL, ResultVT, Extract, DAG.getConstant(1, DL, MVT::i8),
22661 ISD::CondCode::SETLE);
22662}
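// Illustrative sketch (not part of this file): when the FXAM path is not
// needed, "is NaN" reduces to an unordered self-comparison, which is what the
// SETUNE early-return above encodes.
static bool isNaNViaSelfCompare(double X) {
  return X != X; // only NaN compares unequal to itself
}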
22663
22664/// Helper for creating a X86ISD::SETCC node.
22665static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22666 SelectionDAG &DAG) {
22667 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22668 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22669}
22670
22671/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
22672/// style scalarized (associative) reduction patterns. Partial reductions
22673/// are supported when the pointer SrcMask is non-null.
22674/// TODO - move this to SelectionDAG?
22675static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22676 SmallVectorImpl<SDValue> &SrcOps,
22677 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22678 SmallVector<SDValue, 8> Opnds;
22679 DenseMap<SDValue, APInt> SrcOpMap;
22680 EVT VT = MVT::Other;
22681
22682 // Recognize a special case where a vector is cast into a wide integer to
22683 // test all 0s.
22684 assert(Op.getOpcode() == unsigned(BinOp) &&
22685 "Unexpected bit reduction opcode");
22686 Opnds.push_back(Op.getOperand(0));
22687 Opnds.push_back(Op.getOperand(1));
22688
22689 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22690 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
22691 // BFS traverse all BinOp operands.
22692 if (I->getOpcode() == unsigned(BinOp)) {
22693 Opnds.push_back(I->getOperand(0));
22694 Opnds.push_back(I->getOperand(1));
22695 // Re-evaluate the number of nodes to be traversed.
22696 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22697 continue;
22698 }
22699
22700 // Quit if this is not an EXTRACT_VECTOR_ELT.
22701 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22702 return false;
22703
22704 // Quit if without a constant index.
22705 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22706 if (!Idx)
22707 return false;
22708
22709 SDValue Src = I->getOperand(0);
22710 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22711 if (M == SrcOpMap.end()) {
22712 VT = Src.getValueType();
22713 // Quit if not the same type.
22714 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22715 return false;
22716 unsigned NumElts = VT.getVectorNumElements();
22717 APInt EltCount = APInt::getNullValue(NumElts);
22718 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22719 SrcOps.push_back(Src);
22720 }
22721
22722 // Quit if element already used.
22723 unsigned CIdx = Idx->getZExtValue();
22724 if (M->second[CIdx])
22725 return false;
22726 M->second.setBit(CIdx);
22727 }
22728
22729 if (SrcMask) {
22730 // Collect the source partial masks.
22731 for (SDValue &SrcOp : SrcOps)
22732 SrcMask->push_back(SrcOpMap[SrcOp]);
22733 } else {
22734 // Quit if not all elements are used.
22735 for (const auto &I : SrcOpMap)
22736 if (!I.second.isAllOnesValue())
22737 return false;
22738 }
22739
22740 return true;
22741}
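// Illustrative sketch (not part of this file): the scalarized OR-reduction
// shape this matcher recognizes, written as equivalent scalar C++ over a
// hypothetical 4-element source standing in for a v4i32 vector.
static bool anyLaneNonZero(const unsigned X[4]) {
  // or (or (extractelt X,0), (extractelt X,1)),
  //    (or (extractelt X,2), (extractelt X,3))
  return ((X[0] | X[1]) | (X[2] | X[3])) != 0;
}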
22742
22743// Helper function for comparing all bits of a vector against zero.
22744static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
22745 const APInt &Mask,
22746 const X86Subtarget &Subtarget,
22747 SelectionDAG &DAG, X86::CondCode &X86CC) {
22748 EVT VT = V.getValueType();
22749 unsigned ScalarSize = VT.getScalarSizeInBits();
22750 if (Mask.getBitWidth() != ScalarSize) {
22751 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22752 return SDValue();
22753 }
22754
22755 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22756 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22757
22758 auto MaskBits = [&](SDValue Src) {
22759 if (Mask.isAllOnesValue())
22760 return Src;
22761 EVT SrcVT = Src.getValueType();
22762 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22763 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22764 };
22765
22766 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22767 if (VT.getSizeInBits() < 128) {
22768 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22769 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
22770 return SDValue();
22771 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22772 DAG.getBitcast(IntVT, MaskBits(V)),
22773 DAG.getConstant(0, DL, IntVT));
22774 }
22775
22776 // Quit if not splittable to 128/256-bit vector.
22777 if (!isPowerOf2_32(VT.getSizeInBits()))
22778 return SDValue();
22779
22780 // Split down to 128/256-bit vector.
22781 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
22782 while (VT.getSizeInBits() > TestSize) {
22783 auto Split = DAG.SplitVector(V, DL);
22784 VT = Split.first.getValueType();
22785 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22786 }
22787
22788 bool UsePTEST = Subtarget.hasSSE41();
22789 if (UsePTEST) {
22790 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
22791 V = DAG.getBitcast(TestVT, MaskBits(V));
22792 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22793 }
22794
22795 // Without PTEST, a masked v2i64 or-reduction is not faster than
22796 // scalarization.
22797 if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
22798 return SDValue();
22799
22800 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
22801 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
22802 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
22803 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22804 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22805 DAG.getConstant(0xFFFF, DL, MVT::i32));
22806}
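// Illustrative sketch (not part of this file): the two "all bits zero" idioms
// selected above, written with intrinsics. With SSE4.1, PTEST sets ZF
// directly; otherwise PCMPEQB against zero plus MOVMSK must yield 0xFFFF.
#include <emmintrin.h>
#include <smmintrin.h>
static bool allZeroSSE41(__m128i V) {
  return _mm_testz_si128(V, V) != 0;                   // PTEST: ZF = ((V & V) == 0)
}
static bool allZeroSSE2(__m128i V) {
  __m128i Eq = _mm_cmpeq_epi8(V, _mm_setzero_si128()); // PCMPEQB V, 0
  return _mm_movemask_epi8(Eq) == 0xFFFF;              // MOVMSK + CMP 0xFFFF
}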
22807
22808// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
22809// CMP(MOVMSK(PCMPEQB(X,0))).
22810static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
22811 const SDLoc &DL,
22812 const X86Subtarget &Subtarget,
22813 SelectionDAG &DAG, SDValue &X86CC) {
22814 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22815
22816 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22817 return SDValue();
22818
22819 // Check whether we're masking/truncating an OR-reduction result, in which
22820 // case track the masked bits.
22821 APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
22822 switch (Op.getOpcode()) {
22823 case ISD::TRUNCATE: {
22824 SDValue Src = Op.getOperand(0);
22825 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22826 Op.getScalarValueSizeInBits());
22827 Op = Src;
22828 break;
22829 }
22830 case ISD::AND: {
22831 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22832 Mask = Cst->getAPIntValue();
22833 Op = Op.getOperand(0);
22834 }
22835 break;
22836 }
22837 }
22838
22839 SmallVector<SDValue, 8> VecIns;
22840 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
22841 EVT VT = VecIns[0].getValueType();
22842 assert(llvm::all_of(VecIns,
22843 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22844 "Reduction source vector mismatch");
22845
22846 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
22847 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
22848 return SDValue();
22849
22850 // If more than one full vector is evaluated, OR them first before PTEST.
22851 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22852 Slot += 2, e += 1) {
22853 // Each iteration will OR 2 nodes and append the result until there is
22854 // only 1 node left, i.e. the final OR'd value of all vectors.
22855 SDValue LHS = VecIns[Slot];
22856 SDValue RHS = VecIns[Slot + 1];
22857 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
22858 }
22859
22860 X86::CondCode CCode;
22861 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
22862 DAG, CCode)) {
22863 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22864 return V;
22865 }
22866 }
22867
22868 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22869 ISD::NodeType BinOp;
22870 if (SDValue Match =
22871 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
22872 X86::CondCode CCode;
22873 if (SDValue V =
22874 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
22875 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
22876 return V;
22877 }
22878 }
22879 }
22880
22881 return SDValue();
22882}
22883
22884/// Return true if \c Op has a use that doesn't just read flags.
22885static bool hasNonFlagsUse(SDValue Op) {
22886 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22887 ++UI) {
22888 SDNode *User = *UI;
22889 unsigned UOpNo = UI.getOperandNo();
22890 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22891 // Look past the truncate.
22892 UOpNo = User->use_begin().getOperandNo();
22893 User = *User->use_begin();
22894 }
22895
22896 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22897 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22898 return true;
22899 }
22900 return false;
22901}
22902
22903// Transform to an x86-specific ALU node with flags if there is a chance of
22904// using an RMW op or only the flags are used. Otherwise, leave
22905// the node alone and emit a 'cmp' or 'test' instruction.
22906static bool isProfitableToUseFlagOp(SDValue Op) {
22907 for (SDNode *U : Op->uses())
22908 if (U->getOpcode() != ISD::CopyToReg &&
22909 U->getOpcode() != ISD::SETCC &&
22910 U->getOpcode() != ISD::STORE)
22911 return false;
22912
22913 return true;
22914}
22915
22916/// Emit nodes that will be selected as "test Op0,Op0", or something
22917/// equivalent.
22918static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22919 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22920 // CF and OF aren't always set the way we want. Determine which
22921 // of these we need.
22922 bool NeedCF = false;
22923 bool NeedOF = false;
22924 switch (X86CC) {
22925 default: break;
22926 case X86::COND_A: case X86::COND_AE:
22927 case X86::COND_B: case X86::COND_BE:
22928 NeedCF = true;
22929 break;
22930 case X86::COND_G: case X86::COND_GE:
22931 case X86::COND_L: case X86::COND_LE:
22932 case X86::COND_O: case X86::COND_NO: {
22933 // Check if we really need to set the
22934 // Overflow flag. If NoSignedWrap is present
22935 // that is not actually needed.
22936 switch (Op->getOpcode()) {
22937 case ISD::ADD:
22938 case ISD::SUB:
22939 case ISD::MUL:
22940 case ISD::SHL:
22941 if (Op.getNode()->getFlags().hasNoSignedWrap())
22942 break;
22943 LLVM_FALLTHROUGH;
22944 default:
22945 NeedOF = true;
22946 break;
22947 }
22948 break;
22949 }
22950 }
22951 // See if we can use the EFLAGS value from the operand instead of
22952 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22953 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22954 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22955 // Emit a CMP with 0, which is the TEST pattern.
22956 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22957 DAG.getConstant(0, dl, Op.getValueType()));
22958 }
22959 unsigned Opcode = 0;
22960 unsigned NumOperands = 0;
22961
22962 SDValue ArithOp = Op;
22963
22964 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22965 // which may be the result of a CAST. We use the variable 'Op', which is the
22966 // non-casted variable when we check for possible users.
22967 switch (ArithOp.getOpcode()) {
22968 case ISD::AND:
22969 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22970 // because a TEST instruction will be better.
22971 if (!hasNonFlagsUse(Op))
22972 break;
22973
22974 LLVM_FALLTHROUGH;
22975 case ISD::ADD:
22976 case ISD::SUB:
22977 case ISD::OR:
22978 case ISD::XOR:
22979 if (!isProfitableToUseFlagOp(Op))
22980 break;
22981
22982 // Otherwise use a regular EFLAGS-setting instruction.
22983 switch (ArithOp.getOpcode()) {
22984 default: llvm_unreachable("unexpected operator!")::llvm::llvm_unreachable_internal("unexpected operator!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22984)
;
22985 case ISD::ADD: Opcode = X86ISD::ADD; break;
22986 case ISD::SUB: Opcode = X86ISD::SUB; break;
22987 case ISD::XOR: Opcode = X86ISD::XOR; break;
22988 case ISD::AND: Opcode = X86ISD::AND; break;
22989 case ISD::OR: Opcode = X86ISD::OR; break;
22990 }
22991
22992 NumOperands = 2;
22993 break;
22994 case X86ISD::ADD:
22995 case X86ISD::SUB:
22996 case X86ISD::OR:
22997 case X86ISD::XOR:
22998 case X86ISD::AND:
22999 return SDValue(Op.getNode(), 1);
23000 case ISD::SSUBO:
23001 case ISD::USUBO: {
23002 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
23003 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23004 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23005 Op->getOperand(1)).getValue(1);
23006 }
23007 default:
23008 break;
23009 }
23010
23011 if (Opcode == 0) {
23012 // Emit a CMP with 0, which is the TEST pattern.
23013 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23014 DAG.getConstant(0, dl, Op.getValueType()));
23015 }
23016 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23017 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
23018
23019 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23020 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23021 return SDValue(New.getNode(), 1);
23022}
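// Illustrative sketch (not part of this file): why reusing EFLAGS pays off.
// For "if ((a & b) == 0) ..." where the AND result is also consumed, the AND
// already sets ZF, so a separate TEST would recompute the same flag:
//   and  eax, ecx        ; sets ZF from the result
//   jne  skip
// versus
//   and  eax, ecx
//   test eax, eax        ; redundant: recomputes the same ZF
//   jne  skip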
23023
23024/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23025/// equivalent.
23026static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23027 const SDLoc &dl, SelectionDAG &DAG,
23028 const X86Subtarget &Subtarget) {
23029 if (isNullConstant(Op1))
23030 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23031
23032 EVT CmpVT = Op0.getValueType();
23033
23034 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23035 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23036
23037 // Only promote the compare up to I32 if it is a 16 bit operation
23038 // with an immediate. 16 bit immediates are to be avoided.
23039 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
23040 !DAG.getMachineFunction().getFunction().hasMinSize()) {
23041 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
23042 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
23043 // Don't do this if the immediate can fit in 8-bits.
23044 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23045 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23046 unsigned ExtendOp =
23047 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23048 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23049 // For equality comparisons try to use SIGN_EXTEND if the input was
23050 // truncate from something with enough sign bits.
23051 if (Op0.getOpcode() == ISD::TRUNCATE) {
23052 SDValue In = Op0.getOperand(0);
23053 unsigned EffBits =
23054 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
23055 if (EffBits <= 16)
23056 ExtendOp = ISD::SIGN_EXTEND;
23057 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23058 SDValue In = Op1.getOperand(0);
23059 unsigned EffBits =
23060 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
23061 if (EffBits <= 16)
23062 ExtendOp = ISD::SIGN_EXTEND;
23063 }
23064 }
23065
23066 CmpVT = MVT::i32;
23067 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23068 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23069 }
23070 }
23071
23072 // Try to shrink i64 compares if the input has enough zero bits.
23073 // FIXME: Do this for non-constant compares for constant on LHS?
23074 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
23075 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23076 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
23077 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23078 CmpVT = MVT::i32;
23079 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23080 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23081 }
23082
23083 // 0-x == y --> x+y == 0
23084 // 0-x != y --> x+y != 0
23085 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23086 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23087 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23088 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23089 return Add.getValue(1);
23090 }
23091
23092 // x == 0-y --> x+y == 0
23093 // x != 0-y --> x+y != 0
23094 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23095 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23096 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23097 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23098 return Add.getValue(1);
23099 }
23100
23101 // Use SUB instead of CMP to enable CSE between SUB and CMP.
23102 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23103 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
23104 return Sub.getValue(1);
23105}
23106
23107/// Check if replacement of SQRT with RSQRT should be disabled.
23108bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23109 EVT VT = Op.getValueType();
23110
23111 // We never want to use both SQRT and RSQRT instructions for the same input.
23112 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23113 return false;
23114
23115 if (VT.isVector())
23116 return Subtarget.hasFastVectorFSQRT();
23117 return Subtarget.hasFastScalarFSQRT();
23118}
23119
23120/// The minimum architected relative accuracy is 2^-12. We need one
23121/// Newton-Raphson step to have a good float result (24 bits of precision).
23122SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23123 SelectionDAG &DAG, int Enabled,
23124 int &RefinementSteps,
23125 bool &UseOneConstNR,
23126 bool Reciprocal) const {
23127 EVT VT = Op.getValueType();
23128
23129 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23130 // It is likely not profitable to do this for f64 because a double-precision
23131 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23132 // instructions: convert to single, rsqrtss, convert back to double, refine
23133 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23134 // along with FMA, this could be a throughput win.
23135 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23136 // after legalize types.
23137 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23138 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23139 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23140 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23141 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23142 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23143 RefinementSteps = 1;
23144
23145 UseOneConstNR = false;
23146 // There is no FRSQRT for 512-bits, but there is RSQRT14.
23147 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23148 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
23149 }
23150 return SDValue();
23151}
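// Illustrative sketch (not part of this file): roughly what one refinement
// step on the ~2^-12 RSQRTSS estimate computes, assuming <xmmintrin.h>.
#include <xmmintrin.h>
static float rsqrtOneNRStep(float A) {
  float E = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(A))); // ~12-bit estimate
  return E * (1.5f - 0.5f * A * E * E);                 // Newton-Raphson step
}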
23152
23153/// The minimum architected relative accuracy is 2^-12. We need one
23154/// Newton-Raphson step to have a good float result (24 bits of precision).
23155SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23156 int Enabled,
23157 int &RefinementSteps) const {
23158 EVT VT = Op.getValueType();
23159
23160 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23161 // It is likely not profitable to do this for f64 because a double-precision
23162 // reciprocal estimate with refinement on x86 prior to FMA requires
23163 // 15 instructions: convert to single, rcpss, convert back to double, refine
23164 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23165 // along with FMA, this could be a throughput win.
23166
23167 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23168 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23169 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23170 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23171 // Enable estimate codegen with 1 refinement step for vector division.
23172 // Scalar division estimates are disabled because they break too much
23173 // real-world code. These defaults are intended to match GCC behavior.
23174 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23175 return SDValue();
23176
23177 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23178 RefinementSteps = 1;
23179
23180 // There is no FRCP for 512-bits, but there is RCP14.
23181 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23182 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
23183 }
23184 return SDValue();
23185}
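// Illustrative sketch (not part of this file): the corresponding refinement
// for the reciprocal estimate -- one Newton-Raphson step on RCPSS.
#include <xmmintrin.h>
static float recipOneNRStep(float A) {
  float E = _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(A))); // ~12-bit estimate
  return E * (2.0f - A * E);                          // Newton-Raphson step
}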
23186
23187/// If we have at least two divisions that use the same divisor, convert to
23188/// multiplication by a reciprocal. This may need to be adjusted for a given
23189/// CPU if a division's cost is not at least twice the cost of a multiplication.
23190/// This is because we still need one division to calculate the reciprocal and
23191/// then we need two multiplies by that reciprocal as replacements for the
23192/// original divisions.
23193unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23194 return 2;
23195}
23196
23197SDValue
23198X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23199 SelectionDAG &DAG,
23200 SmallVectorImpl<SDNode *> &Created) const {
23201 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23202 if (isIntDivCheap(N->getValueType(0), Attr))
23203 return SDValue(N,0); // Lower SDIV as SDIV
23204
23205 assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
23206 "Unexpected divisor!");
23207
23208 // Only perform this transform if CMOV is supported otherwise the select
23209 // below will become a branch.
23210 if (!Subtarget.hasCMov())
23211 return SDValue();
23212
23213 // fold (sdiv X, pow2)
23214 EVT VT = N->getValueType(0);
23215 // FIXME: Support i8.
23216 if (VT != MVT::i16 && VT != MVT::i32 &&
23217 !(Subtarget.is64Bit() && VT == MVT::i64))
23218 return SDValue();
23219
23220 unsigned Lg2 = Divisor.countTrailingZeros();
23221
23222 // If the divisor is 2 or -2, the default expansion is better.
23223 if (Lg2 == 1)
23224 return SDValue();
23225
23226 SDLoc DL(N);
23227 SDValue N0 = N->getOperand(0);
23228 SDValue Zero = DAG.getConstant(0, DL, VT);
23229 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
23230 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
23231
23232 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
23233 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
23234 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
23235 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
23236
23237 Created.push_back(Cmp.getNode());
23238 Created.push_back(Add.getNode());
23239 Created.push_back(CMov.getNode());
23240
23241 // Divide by pow2.
23242 SDValue SRA =
23243 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
23244
23245 // If we're dividing by a positive value, we're done. Otherwise, we must
23246 // negate the result.
23247 if (Divisor.isNonNegative())
23248 return SRA;
23249
23250 Created.push_back(SRA.getNode());
23251 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
23252}
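// Illustrative sketch (not part of this file): the branchless sequence built
// above, for a 32-bit signed divide by 8 (Lg2 = 3). The bias Pow2-1 is added
// only for negative inputs via CMOV, then an arithmetic shift divides.
static int sdivByEight(int X) {
  int Biased = X + 7;             // N0 + (Pow2 - 1)
  int Sel = (X < 0) ? Biased : X; // SETLT + CMOV
  return Sel >> 3;                // SRA by Lg2 (arithmetic shift assumed)
}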
23253
23254/// Result of 'and' is compared against zero. Change to a BT node if possible.
23255/// Returns the BT node and the condition code needed to use it.
23256static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
23257 const SDLoc &dl, SelectionDAG &DAG,
23258 SDValue &X86CC) {
23259 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23260 SDValue Op0 = And.getOperand(0);
23261 SDValue Op1 = And.getOperand(1);
23262 if (Op0.getOpcode() == ISD::TRUNCATE)
23263 Op0 = Op0.getOperand(0);
23264 if (Op1.getOpcode() == ISD::TRUNCATE)
23265 Op1 = Op1.getOperand(0);
23266
23267 SDValue Src, BitNo;
23268 if (Op1.getOpcode() == ISD::SHL)
23269 std::swap(Op0, Op1);
23270 if (Op0.getOpcode() == ISD::SHL) {
23271 if (isOneConstant(Op0.getOperand(0))) {
23272 // If we looked past a truncate, check that it's only truncating away
23273 // known zeros.
23274 unsigned BitWidth = Op0.getValueSizeInBits();
23275 unsigned AndBitWidth = And.getValueSizeInBits();
23276 if (BitWidth > AndBitWidth) {
23277 KnownBits Known = DAG.computeKnownBits(Op0);
23278 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23279 return SDValue();
23280 }
23281 Src = Op1;
23282 BitNo = Op0.getOperand(1);
23283 }
23284 } else if (Op1.getOpcode() == ISD::Constant) {
23285 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23286 uint64_t AndRHSVal = AndRHS->getZExtValue();
23287 SDValue AndLHS = Op0;
23288
23289 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23290 Src = AndLHS.getOperand(0);
23291 BitNo = AndLHS.getOperand(1);
23292 } else {
23293 // Use BT if the immediate can't be encoded in a TEST instruction or we
23294 // are optimizing for size and the immediate won't fit in a byte.
23295 bool OptForSize = DAG.shouldOptForSize();
23296 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23297 isPowerOf2_64(AndRHSVal)) {
23298 Src = AndLHS;
23299 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23300 Src.getValueType());
23301 }
23302 }
23303 }
23304
23305 // No patterns found, give up.
23306 if (!Src.getNode())
23307 return SDValue();
23308
23309 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
23310 // instruction. Since the shift amount is in-range-or-undefined, we know
23311 // that doing a bittest on the i32 value is ok. We extend to i32 because
23312 // the encoding for the i16 version is larger than the i32 version.
23313 // Also promote i16 to i32 for performance / code size reasons.
23314 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
23315 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
23316
23317 // See if we can use the 32-bit instruction instead of the 64-bit one for a
23318 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
23319 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
23320 // known to be zero.
23321 if (Src.getValueType() == MVT::i64 &&
23322 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
23323 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
23324
23325 // If the operand types disagree, extend the shift amount to match. Since
23326 // BT ignores high bits (like shifts) we can use anyextend.
23327 if (Src.getValueType() != BitNo.getValueType())
23328 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
23329
23330 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
23331 dl, MVT::i8);
23332 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
23333}
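// Illustrative sketch (not part of this file): the source-level patterns that
// get folded into a BT instruction -- testing one variable bit position.
static bool bitTestSrlForm(unsigned X, unsigned N) {
  return ((X >> N) & 1) != 0;  // and (srl X, N), 1
}
static bool bitTestShlForm(unsigned X, unsigned N) {
  return (X & (1u << N)) != 0; // and X, (shl 1, N)
}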
23334
23335/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23336/// CMPs.
23337static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23338 SDValue &Op1, bool &IsAlwaysSignaling) {
23339 unsigned SSECC;
23340 bool Swap = false;
23341
23342 // SSE Condition code mapping:
23343 // 0 - EQ
23344 // 1 - LT
23345 // 2 - LE
23346 // 3 - UNORD
23347 // 4 - NEQ
23348 // 5 - NLT
23349 // 6 - NLE
23350 // 7 - ORD
23351 switch (SetCCOpcode) {
23352 default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 23352)
;
23353 case ISD::SETOEQ:
23354 case ISD::SETEQ: SSECC = 0; break;
23355 case ISD::SETOGT:
23356 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
23357 case ISD::SETLT:
23358 case ISD::SETOLT: SSECC = 1; break;
23359 case ISD::SETOGE:
23360 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
23361 case ISD::SETLE:
23362 case ISD::SETOLE: SSECC = 2; break;
23363 case ISD::SETUO: SSECC = 3; break;
23364 case ISD::SETUNE:
23365 case ISD::SETNE: SSECC = 4; break;
23366 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
23367 case ISD::SETUGE: SSECC = 5; break;
23368 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
23369 case ISD::SETUGT: SSECC = 6; break;
23370 case ISD::SETO: SSECC = 7; break;
23371 case ISD::SETUEQ: SSECC = 8; break;
23372 case ISD::SETONE: SSECC = 12; break;
23373 }
23374 if (Swap)
23375 std::swap(Op0, Op1);
23376
23377 switch (SetCCOpcode) {
23378 default:
23379 IsAlwaysSignaling = true;
23380 break;
23381 case ISD::SETEQ:
23382 case ISD::SETOEQ:
23383 case ISD::SETUEQ:
23384 case ISD::SETNE:
23385 case ISD::SETONE:
23386 case ISD::SETUNE:
23387 case ISD::SETO:
23388 case ISD::SETUO:
23389 IsAlwaysSignaling = false;
23390 break;
23391 }
23392
23393 return SSECC;
23394}
23395
23396/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23397/// concatenate the result back.
23398static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23399 ISD::CondCode Cond, SelectionDAG &DAG,
23400 const SDLoc &dl) {
23401 assert(VT.isInteger() && VT == LHS.getValueType() &&
23402 VT == RHS.getValueType() && "Unsupported VTs!");
23403
23404 SDValue CC = DAG.getCondCode(Cond);
23405
23406 // Extract the LHS Lo/Hi vectors
23407 SDValue LHS1, LHS2;
23408 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23409
23410 // Extract the RHS Lo/Hi vectors
23411 SDValue RHS1, RHS2;
23412 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23413
23414 // Issue the operation on the smaller types and concatenate the result back
23415 EVT LoVT, HiVT;
23416 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23417 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23418 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23419 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23420}
23421
23422static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23423
23424 SDValue Op0 = Op.getOperand(0);
23425 SDValue Op1 = Op.getOperand(1);
23426 SDValue CC = Op.getOperand(2);
23427 MVT VT = Op.getSimpleValueType();
23428 SDLoc dl(Op);
23429
23430 assert(VT.getVectorElementType() == MVT::i1 &&
23431 "Cannot set masked compare for this operation");
23432
23433 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23434
23435 // Prefer SETGT over SETLT.
23436 if (SetCCOpcode == ISD::SETLT) {
23437 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23438 std::swap(Op0, Op1);
23439 }
23440
23441 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23442}
23443
23444/// Given a buildvector constant, return a new vector constant with each element
23445/// incremented or decremented. If incrementing or decrementing would result in
23446/// unsigned overflow or underflow or this is not a simple vector constant,
23447/// return an empty value.
23448static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
23449 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23450 if (!BV)
23451 return SDValue();
23452
23453 MVT VT = V.getSimpleValueType();
23454 MVT EltVT = VT.getVectorElementType();
23455 unsigned NumElts = VT.getVectorNumElements();
23456 SmallVector<SDValue, 8> NewVecC;
23457 SDLoc DL(V);
23458 for (unsigned i = 0; i < NumElts; ++i) {
23459 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23460 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23461 return SDValue();
23462
23463 // Avoid overflow/underflow.
23464 const APInt &EltC = Elt->getAPIntValue();
23465 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
23466 return SDValue();
23467
23468 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23469 }
23470
23471 return DAG.getBuildVector(VT, DL, NewVecC);
23472}
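A plain-C++ sketch of the helper above (hypothetical, operating on a simple array instead of a BUILD_VECTOR node) shows the core idea: produce the per-element +1/-1 constant, or report failure if any lane would wrap:

// Hedged sketch; incDecConstant is a made-up stand-in, not LLVM code.
#include <cassert>
#include <cstdint>
#include <vector>

static bool incDecConstant(const std::vector<uint8_t> &In, bool IsInc,
                           std::vector<uint8_t> &Out) {
  Out.clear();
  for (uint8_t E : In) {
    if ((IsInc && E == 0xff) || (!IsInc && E == 0)) // would overflow/underflow
      return false;                                 // caller falls back
    Out.push_back(static_cast<uint8_t>(IsInc ? E + 1 : E - 1));
  }
  return true;
}

int main() {
  std::vector<uint8_t> Out;
  assert(incDecConstant({1, 2, 3}, /*IsInc=*/true, Out) && Out[2] == 4);
  assert(!incDecConstant({0, 5}, /*IsInc=*/false, Out)); // 0 - 1 would wrap
  return 0;
}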
23473
23474/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23475/// Op0 u<= Op1:
23476/// t = psubus Op0, Op1
23477/// pcmpeq t, <0..0>
23478static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23479 ISD::CondCode Cond, const SDLoc &dl,
23480 const X86Subtarget &Subtarget,
23481 SelectionDAG &DAG) {
23482 if (!Subtarget.hasSSE2())
23483 return SDValue();
23484
23485 MVT VET = VT.getVectorElementType();
23486 if (VET != MVT::i8 && VET != MVT::i16)
23487 return SDValue();
23488
23489 switch (Cond) {
23490 default:
23491 return SDValue();
23492 case ISD::SETULT: {
23493 // If the comparison is against a constant we can turn this into a
23494 // setule. With psubus, setule does not require a swap. This is
23495 // beneficial because the constant in the register is no longer
23496 // clobbered as the destination, so it can be hoisted out of a loop.
23497 // Only do this pre-AVX since vpcmp* is no longer destructive.
23498 if (Subtarget.hasAVX())
23499 return SDValue();
23500 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
23501 if (!ULEOp1)
23502 return SDValue();
23503 Op1 = ULEOp1;
23504 break;
23505 }
23506 case ISD::SETUGT: {
23507 // If the comparison is against a constant, we can turn this into a setuge.
23508 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23509 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23510 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23511 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
23512 if (!UGEOp1)
23513 return SDValue();
23514 Op1 = Op0;
23515 Op0 = UGEOp1;
23516 break;
23517 }
23518 // Psubus is better than flip-sign because it requires no inversion.
23519 case ISD::SETUGE:
23520 std::swap(Op0, Op1);
23521 break;
23522 case ISD::SETULE:
23523 break;
23524 }
23525
23526 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23527 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23528 DAG.getConstant(0, dl, VT));
23529}
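As a sanity check on the PSUBUS idiom above, the following plain-C++ scalar model (not SelectionDAG code) verifies exhaustively for 8-bit lanes that an unsigned saturating subtract compared against zero is the same predicate as unsigned less-or-equal:

// Scalar model of the PSUBUS + PCMPEQ idiom: usubsat(a, b) == 0  <=>  a u<= b.
#include <cassert>
#include <cstdint>

static uint8_t USubSat(uint8_t A, uint8_t B) {
  return A > B ? static_cast<uint8_t>(A - B) : 0; // saturate at zero
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      assert((USubSat(A, B) == 0) == (A <= B));
  return 0;
}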
23530
23531static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23532 SelectionDAG &DAG) {
23533 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23534 Op.getOpcode() == ISD::STRICT_FSETCCS;
23535 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23536 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23537 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23538 MVT VT = Op->getSimpleValueType(0);
23539 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23540 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23541 SDLoc dl(Op);
23542
23543 if (isFP) {
23544#ifndef NDEBUG
23545 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23546 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23547#endif
23548
23549 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23550 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23551
23552 // If we have a strict compare with a vXi1 result and the input is 128/256
23553 // bits we can't use a masked compare unless we have VLX. If we use a wider
23554 // compare like we do for non-strict, we might trigger spurious exceptions
23555 // from the upper elements. Instead emit an AVX compare and convert to mask.
23556 unsigned Opc;
23557 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23558 (!IsStrict || Subtarget.hasVLX() ||
23559 Op0.getSimpleValueType().is512BitVector())) {
23560#ifndef NDEBUG
23561 unsigned Num = VT.getVectorNumElements();
23562 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23563#endif
23564 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23565 } else {
23566 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23567 // The SSE/AVX packed FP comparison nodes are defined with a
23568 // floating-point vector result that matches the operand type. This allows
23569 // them to work with an SSE1 target (integer vector types are not legal).
23570 VT = Op0.getSimpleValueType();
23571 }
23572
23573 SDValue Cmp;
23574 bool IsAlwaysSignaling;
23575 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23576 if (!Subtarget.hasAVX()) {
23577 // TODO: We could use following steps to handle a quiet compare with
23578 // signaling encodings.
23579 // 1. Get ordered masks from a quiet ISD::SETO
23580 // 2. Use the masks to mask potential unordered elements in operand A, B
23581 // 3. Get the compare results of masked A, B
23582 // 4. Calculate the final result using the mask and the result from 3
23583 // But currently, we just fall back to scalar operations.
23584 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23585 return SDValue();
23586
23587 // Insert an extra signaling instruction to raise exception.
23588 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23589 SDValue SignalCmp = DAG.getNode(
23590 Opc, dl, {VT, MVT::Other},
23591 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23592 // FIXME: It seems we need to update the flags of all new strict nodes.
23593 // Otherwise, mayRaiseFPException in MI will return false due to
23594 // NoFPExcept = false by default. However, I didn't find it in other
23595 // patches.
23596 SignalCmp->setFlags(Op->getFlags());
23597 Chain = SignalCmp.getValue(1);
23598 }
23599
23600 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23601 // emit two comparisons and a logic op to tie them together.
23602 if (SSECC >= 8) {
23603 // LLVM predicate is SETUEQ or SETONE.
23604 unsigned CC0, CC1;
23605 unsigned CombineOpc;
23606 if (Cond == ISD::SETUEQ) {
23607 CC0 = 3; // UNORD
23608 CC1 = 0; // EQ
23609 CombineOpc = X86ISD::FOR;
23610 } else {
23611 assert(Cond == ISD::SETONE);
23612 CC0 = 7; // ORD
23613 CC1 = 4; // NEQ
23614 CombineOpc = X86ISD::FAND;
23615 }
23616
23617 SDValue Cmp0, Cmp1;
23618 if (IsStrict) {
23619 Cmp0 = DAG.getNode(
23620 Opc, dl, {VT, MVT::Other},
23621 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23622 Cmp1 = DAG.getNode(
23623 Opc, dl, {VT, MVT::Other},
23624 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23625 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23626 Cmp1.getValue(1));
23627 } else {
23628 Cmp0 = DAG.getNode(
23629 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23630 Cmp1 = DAG.getNode(
23631 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23632 }
23633 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23634 } else {
23635 if (IsStrict) {
23636 Cmp = DAG.getNode(
23637 Opc, dl, {VT, MVT::Other},
23638 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23639 Chain = Cmp.getValue(1);
23640 } else
23641 Cmp = DAG.getNode(
23642 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23643 }
23644 } else {
23645 // Handle all other FP comparisons here.
23646 if (IsStrict) {
23647 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23648 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23649 Cmp = DAG.getNode(
23650 Opc, dl, {VT, MVT::Other},
23651 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23652 Chain = Cmp.getValue(1);
23653 } else
23654 Cmp = DAG.getNode(
23655 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23656 }
23657
23658 if (VT.getFixedSizeInBits() >
23659 Op.getSimpleValueType().getFixedSizeInBits()) {
23660 // We emitted a compare with an XMM/YMM result. Finish converting to a
23661 // mask register using a vptestm.
23662 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23663 Cmp = DAG.getBitcast(CastVT, Cmp);
23664 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23665 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23666 } else {
23667 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23668 // the result type of SETCC. The bitcast is expected to be optimized
23669 // away during combining/isel.
23670 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23671 }
23672
23673 if (IsStrict)
23674 return DAG.getMergeValues({Cmp, Chain}, dl);
23675
23676 return Cmp;
23677 }
23678
23679 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23680
23681 MVT VTOp0 = Op0.getSimpleValueType();
23682 (void)VTOp0;
23683 assert(VTOp0 == Op1.getSimpleValueType() &&
23684 "Expected operands with same type!");
23685 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23686 "Invalid number of packed elements for source and destination!");
23687
23688 // The non-AVX512 code below works under the assumption that source and
23689 // destination types are the same.
23690 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23691 "Value types for source and destination must be the same!");
23692
23693 // The result is boolean, but operands are int/float
23694 if (VT.getVectorElementType() == MVT::i1) {
23695 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23696 // but there is no compare instruction for i8 and i16 elements in KNL.
23697 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23698 "Unexpected operand type");
23699 return LowerIntVSETCC_AVX512(Op, DAG);
23700 }
23701
23702 // Lower using XOP integer comparisons.
23703 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23704 // Translate compare code to XOP PCOM compare mode.
23705 unsigned CmpMode = 0;
23706 switch (Cond) {
23707 default: llvm_unreachable("Unexpected SETCC condition");
23708 case ISD::SETULT:
23709 case ISD::SETLT: CmpMode = 0x00; break;
23710 case ISD::SETULE:
23711 case ISD::SETLE: CmpMode = 0x01; break;
23712 case ISD::SETUGT:
23713 case ISD::SETGT: CmpMode = 0x02; break;
23714 case ISD::SETUGE:
23715 case ISD::SETGE: CmpMode = 0x03; break;
23716 case ISD::SETEQ: CmpMode = 0x04; break;
23717 case ISD::SETNE: CmpMode = 0x05; break;
23718 }
23719
23720 // Are we comparing unsigned or signed integers?
23721 unsigned Opc =
23722 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23723
23724 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23725 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23726 }
23727
23728 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23729 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23730 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23731 SDValue BC0 = peekThroughBitcasts(Op0);
23732 if (BC0.getOpcode() == ISD::AND) {
23733 APInt UndefElts;
23734 SmallVector<APInt, 64> EltBits;
23735 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
23736 VT.getScalarSizeInBits(), UndefElts,
23737 EltBits, false, false)) {
23738 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23739 Cond = ISD::SETEQ;
23740 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23741 }
23742 }
23743 }
23744 }
23745
23746 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
23747 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23748 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23749 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23750 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23751 unsigned BitWidth = VT.getScalarSizeInBits();
23752 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23753
23754 SDValue Result = Op0.getOperand(0);
23755 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23756 DAG.getConstant(ShiftAmt, dl, VT));
23757 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23758 DAG.getConstant(BitWidth - 1, dl, VT));
23759 return Result;
23760 }
23761 }
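A scalar check of the SHL/SRA identity above (plain C++; it assumes the usual x86 compiler behavior of arithmetic right shift on signed values and modular conversion to int32_t):

// For a power-of-two constant C:
//   ((x & C) == C) ? -1 : 0   ==   (int32_t)(x << (BW-1-log2(C))) >> (BW-1)
#include <cassert>
#include <cstdint>

int main() {
  const unsigned BW = 32;
  for (unsigned Log2C = 0; Log2C < BW; ++Log2C) {
    uint32_t C = 1u << Log2C;
    const uint32_t Xs[] = {0u, C, ~C, 0xffffffffu, 0x12345678u};
    for (uint32_t X : Xs) {
      int32_t ViaShift = static_cast<int32_t>(X << (BW - 1 - Log2C)) >> (BW - 1);
      int32_t ViaCmp = ((X & C) == C) ? -1 : 0;
      assert(ViaShift == ViaCmp);
    }
  }
  return 0;
}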
23762
23763 // Break 256-bit integer vector compare into smaller ones.
23764 if (VT.is256BitVector() && !Subtarget.hasInt256())
23765 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23766
23767 if (VT == MVT::v32i16 || VT == MVT::v64i8) {
23768 assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
23769 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23770 }
23771
23772 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23773 // not-of-PCMPEQ:
23774 // X != INT_MIN --> X >s INT_MIN
23775 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23776 // +X != 0 --> +X >s 0
23777 APInt ConstValue;
23778 if (Cond == ISD::SETNE &&
23779 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23780 if (ConstValue.isMinSignedValue())
23781 Cond = ISD::SETGT;
23782 else if (ConstValue.isMaxSignedValue())
23783 Cond = ISD::SETLT;
23784 else if (ConstValue.isNullValue() && DAG.SignBitIsZero(Op0))
23785 Cond = ISD::SETGT;
23786 }
23787
23788 // If both operands are known non-negative, then an unsigned compare is the
23789 // same as a signed compare and there's no need to flip signbits.
23790 // TODO: We could check for more general simplifications here since we're
23791 // computing known bits.
23792 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23793 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23794
23795 // Special case: Use min/max operations for unsigned compares.
23796 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23797 if (ISD::isUnsignedIntSetCC(Cond) &&
23798 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23799 TLI.isOperationLegal(ISD::UMIN, VT)) {
23800 // If we have a constant operand, increment/decrement it and change the
23801 // condition to avoid an invert.
23802 if (Cond == ISD::SETUGT) {
23803 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23804 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
23805 Op1 = UGTOp1;
23806 Cond = ISD::SETUGE;
23807 }
23808 }
23809 if (Cond == ISD::SETULT) {
23810 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23811 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
23812 Op1 = ULTOp1;
23813 Cond = ISD::SETULE;
23814 }
23815 }
23816 bool Invert = false;
23817 unsigned Opc;
23818 switch (Cond) {
23819 default: llvm_unreachable("Unexpected condition code");
23820 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
23821 case ISD::SETULE: Opc = ISD::UMIN; break;
23822 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
23823 case ISD::SETUGE: Opc = ISD::UMAX; break;
23824 }
23825
23826 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23827 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23828
23829 // If the logical-not of the result is required, perform that now.
23830 if (Invert)
23831 Result = DAG.getNOT(dl, Result, VT);
23832
23833 return Result;
23834 }
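The UMIN/UMAX rewrite above relies on the identities x <= y iff x == umin(x, y) and x >= y iff x == umax(x, y); a small exhaustive check over 8-bit values (plain C++, not DAG code):

// Exhaustive 8-bit check of the min/max compare identities used above.
#include <algorithm>
#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y) {
      assert((X <= Y) == (X == std::min(X, Y))); // SETULE via UMIN + PCMPEQ
      assert((X >= Y) == (X == std::max(X, Y))); // SETUGE via UMAX + PCMPEQ
    }
  return 0;
}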
23835
23836 // Try to use SUBUS and PCMPEQ.
23837 if (FlipSigns)
23838 if (SDValue V =
23839 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23840 return V;
23841
23842 // We are handling one of the integer comparisons here. Since SSE only has
23843 // GT and EQ comparisons for integer, swapping operands and multiple
23844 // operations may be required for some comparisons.
23845 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23846 : X86ISD::PCMPGT;
23847 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23848 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23849 bool Invert = Cond == ISD::SETNE ||
23850 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23851
23852 if (Swap)
23853 std::swap(Op0, Op1);
23854
23855 // Check that the operation in question is available (most are plain SSE2,
23856 // but PCMPGTQ and PCMPEQQ have different requirements).
23857 if (VT == MVT::v2i64) {
23858 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23859 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23860
23861 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23862 // the odd elements over the even elements.
23863 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23864 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23865 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23866
23867 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23868 static const int MaskHi[] = { 1, 1, 3, 3 };
23869 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23870
23871 return DAG.getBitcast(VT, Result);
23872 }
23873
23874 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23875 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23876 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23877
23878 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23879 static const int MaskHi[] = { 1, 1, 3, 3 };
23880 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23881
23882 return DAG.getBitcast(VT, Result);
23883 }
23884
23885 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23886 // bits of the inputs before performing those operations. The lower
23887 // compare is always unsigned.
23888 SDValue SB;
23889 if (FlipSigns) {
23890 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
23891 } else {
23892 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
23893 }
23894 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23895 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23896
23897 // Cast everything to the right type.
23898 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23899 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23900
23901 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23902 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23903 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23904
23905 // Create masks for only the low parts/high parts of the 64 bit integers.
23906 static const int MaskHi[] = { 1, 1, 3, 3 };
23907 static const int MaskLo[] = { 0, 0, 2, 2 };
23908 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23909 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23910 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23911
23912 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23913 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23914
23915 if (Invert)
23916 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23917
23918 return DAG.getBitcast(VT, Result);
23919 }
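The identity used above can be checked with a hedged scalar model in plain C++: split a 64-bit signed compare into a signed compare of the high halves and an unsigned compare of the low halves, which is what the sign-bit adjustment of the low halves achieves in the DAG:

// Scalar model: 64-bit signed greater-than from 32-bit pieces.
//   a s> b  ==  (hi(a) s> hi(b)) | ((hi(a) == hi(b)) & (lo(a) u> lo(b)))
#include <cassert>
#include <cstdint>
#include <limits>

static bool SGT64(int64_t A, int64_t B) {
  // Modular conversions assumed, as on x86 compilers.
  int32_t AHi = static_cast<int32_t>(static_cast<uint64_t>(A) >> 32);
  int32_t BHi = static_cast<int32_t>(static_cast<uint64_t>(B) >> 32);
  uint32_t ALo = static_cast<uint32_t>(A);
  uint32_t BLo = static_cast<uint32_t>(B);
  return (AHi > BHi) || ((AHi == BHi) && (ALo > BLo));
}

int main() {
  const int64_t Vals[] = {std::numeric_limits<int64_t>::min(), -1, 0, 1,
                          0x100000000LL, std::numeric_limits<int64_t>::max()};
  for (int64_t A : Vals)
    for (int64_t B : Vals)
      assert(SGT64(A, B) == (A > B));
  return 0;
}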
23920
23921 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23922 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23923 // pcmpeqd + pshufd + pand.
23924 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23925
23926 // First cast everything to the right type.
23927 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23928 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23929
23930 // Do the compare.
23931 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23932
23933 // Make sure the lower and upper halves are both all-ones.
23934 static const int Mask[] = { 1, 0, 3, 2 };
23935 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23936 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23937
23938 if (Invert)
23939 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23940
23941 return DAG.getBitcast(VT, Result);
23942 }
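Similarly, the pcmpeqd + pshufd + pand synthesis rests on the fact that a 64-bit equality is the conjunction of the two 32-bit half equalities; a scalar spot check in plain C++:

// Scalar model: 64-bit equality as the AND of the 32-bit half equalities.
#include <cassert>
#include <cstdint>

static bool EQ64(uint64_t A, uint64_t B) {
  return (static_cast<uint32_t>(A) == static_cast<uint32_t>(B)) &&
         (static_cast<uint32_t>(A >> 32) == static_cast<uint32_t>(B >> 32));
}

int main() {
  const uint64_t Vals[] = {0u, 1u, 0xffffffffULL, 0x100000000ULL, ~0ULL};
  for (uint64_t A : Vals)
    for (uint64_t B : Vals)
      assert(EQ64(A, B) == (A == B));
  return 0;
}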
23943 }
23944
23945 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23946 // bits of the inputs before performing those operations.
23947 if (FlipSigns) {
23948 MVT EltVT = VT.getVectorElementType();
23949 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23950 VT);
23951 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23952 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23953 }
23954
23955 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23956
23957 // If the logical-not of the result is required, perform that now.
23958 if (Invert)
23959 Result = DAG.getNOT(dl, Result, VT);
23960
23961 return Result;
23962}
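The FlipSigns path above depends on the classic trick that XORing both operands with the sign mask turns an unsigned comparison into the equivalent signed one; an exhaustive 8-bit check (plain C++, assuming the usual modular conversion to int8_t on x86 compilers):

// Sign-bit flip check:  a u> b  <=>  (a ^ 0x80) s> (b ^ 0x80).
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      int8_t FA = static_cast<int8_t>(A ^ 0x80u);
      int8_t FB = static_cast<int8_t>(B ^ 0x80u);
      assert((A > B) == (FA > FB));
    }
  return 0;
}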
23963
23964// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
23965static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23966 const SDLoc &dl, SelectionDAG &DAG,
23967 const X86Subtarget &Subtarget,
23968 SDValue &X86CC) {
23969 // Only support equality comparisons.
23970 if (CC != ISD::SETEQ && CC != ISD::SETNE)
23971 return SDValue();
23972
23973 // Must be a bitcast from vXi1.
23974 if (Op0.getOpcode() != ISD::BITCAST)
23975 return SDValue();
23976
23977 Op0 = Op0.getOperand(0);
23978 MVT VT = Op0.getSimpleValueType();
23979 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23980 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23981 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23982 return SDValue();
23983
23984 X86::CondCode X86Cond;
23985 if (isNullConstant(Op1)) {
23986 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23987 } else if (isAllOnesConstant(Op1)) {
23988 // C flag is set for all ones.
23989 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23990 } else
23991 return SDValue();
23992
23993 // If the input is an AND, we can combine its operands into the KTEST.
23994 bool KTestable = false;
23995 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23996 KTestable = true;
23997 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23998 KTestable = true;
23999 if (!isNullConstant(Op1))
24000 KTestable = false;
24001 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24002 SDValue LHS = Op0.getOperand(0);
24003 SDValue RHS = Op0.getOperand(1);
24004 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24005 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24006 }
24007
24008 // If the input is an OR, we can combine its operands into the KORTEST.
24009 SDValue LHS = Op0;
24010 SDValue RHS = Op0;
24011 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24012 LHS = Op0.getOperand(0);
24013 RHS = Op0.getOperand(1);
24014 }
24015
24016 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24017 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24018}
24019
24020/// Emit flags for the given setcc condition and operands. Also returns the
24021/// corresponding X86 condition code constant in X86CC.
24022SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24023 ISD::CondCode CC, const SDLoc &dl,
24024 SelectionDAG &DAG,
24025 SDValue &X86CC) const {
24026 // Optimize to BT if possible.
24027 // Lower (X & (1 << N)) == 0 to BT(X, N).
24028 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24029 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24030 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
24031 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24032 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
24033 return BT;
24034 }
24035
24036 // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
24037 // TODO: We could do AND tree with all 1s as well by using the C flag.
24038 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
24039 if (SDValue CmpZ =
24040 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
24041 return CmpZ;
24042
24043 // Try to lower using KORTEST or KTEST.
24044 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24045 return Test;
24046
24047 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
24048 // these.
24049 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
24050 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24051 // If the input is a setcc, then reuse the input setcc or use a new one with
24052 // the inverted condition.
24053 if (Op0.getOpcode() == X86ISD::SETCC) {
24054 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24055
24056 X86CC = Op0.getOperand(0);
24057 if (Invert) {
24058 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24059 CCode = X86::GetOppositeBranchCondition(CCode);
24060 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24061 }
24062
24063 return Op0.getOperand(1);
24064 }
24065 }
24066
24067 // Try to use the carry flag from the add in place of a separate CMP for:
24068 // (seteq (add X, -1), -1). Similar for setne.
24069 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24070 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24071 if (isProfitableToUseFlagOp(Op0)) {
24072 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24073
24074 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24075 Op0.getOperand(1));
24076 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24077 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24078 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
24079 return SDValue(New.getNode(), 1);
24080 }
24081 }
24082
24083 X86::CondCode CondCode =
24084 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24085 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24086
24087 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24088 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24089 return EFLAGS;
24090}
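For the BT lowering at the top of emitFlagsForSetcc, a scalar model of the equivalence it relies on ((X & (1 << N)) != 0 is exactly a test of bit N) looks like this in plain C++:

// Scalar model of the BT equivalence: (X & (1 << N)) != 0 tests bit N.
#include <cassert>
#include <cstdint>

static bool TestBit(uint32_t X, unsigned N) { return (X >> N) & 1u; }

int main() {
  const uint32_t X = 0xA6u;
  for (unsigned N = 0; N < 32; ++N)
    assert(((X & (1u << N)) != 0) == TestBit(X, N));
  return 0;
}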
24091
24092SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24093
24094 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24095 Op.getOpcode() == ISD::STRICT_FSETCCS;
24096 MVT VT = Op->getSimpleValueType(0);
24097
24098 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24099
24100 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24101 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24102 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24103 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24104 SDLoc dl(Op);
24105 ISD::CondCode CC =
24106 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24107
24108 // Handle f128 first, since one possible outcome is a normal integer
24109 // comparison which gets handled by emitFlagsForSetcc.
24110 if (Op0.getValueType() == MVT::f128) {
24111 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24112 Op.getOpcode() == ISD::STRICT_FSETCCS);
24113
24114 // If softenSetCCOperands returned a scalar, use it.
24115 if (!Op1.getNode()) {
24116 assert(Op0.getValueType() == Op.getValueType() &&
24117 "Unexpected setcc expansion!");
24118 if (IsStrict)
24119 return DAG.getMergeValues({Op0, Chain}, dl);
24120 return Op0;
24121 }
24122 }
24123
24124 if (Op0.getSimpleValueType().isInteger()) {
24125 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24126 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
24127 // this may translate to less uops depending on uarch implementation. The
24128 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24129 // canonicalize to that CondCode.
24130 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24131 // encoding size - so it must either already be a i8 or i32 immediate, or it
24132 // shrinks down to that. We don't do this for any i64's to avoid additional
24133 // constant materializations.
24134 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24135 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24136 const APInt &Op1Val = Op1C->getAPIntValue();
24137 if (!Op1Val.isNullValue()) {
24138 // Ensure the constant+1 doesn't overflow.
24139 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24140 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24141 APInt Op1ValPlusOne = Op1Val + 1;
24142 if (Op1ValPlusOne.isSignedIntN(32) &&
24143 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24144 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24145 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24146 : ISD::CondCode::SETUGE;
24147 }
24148 }
24149 }
24150 }
24151
24152 SDValue X86CC;
24153 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24154 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24155 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24156 }
24157
24158 // Handle floating point.
24159 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24160 if (CondCode == X86::COND_INVALID)
24161 return SDValue();
24162
24163 SDValue EFLAGS;
24164 if (IsStrict) {
24165 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24166 EFLAGS =
24167 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24168 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24169 Chain = EFLAGS.getValue(1);
24170 } else {
24171 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24172 }
24173
24174 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24175 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24176 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24177}
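The SETGT/SETUGT canonicalization in LowerSETCC uses the fact that, as long as C + 1 does not overflow, X > C and X >= C + 1 are the same predicate; a small scalar check (plain C++):

// Scalar check: for a constant C below the maximum, X > C  <=>  X >= C + 1.
#include <cassert>

int main() {
  for (int X = -128; X <= 127; ++X)
    for (int C = -128; C <= 126; ++C) // exclude C == 127 so C + 1 cannot overflow
      assert((X > C) == (X >= C + 1));
  return 0;
}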
24178
24179SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24180 SDValue LHS = Op.getOperand(0);
24181 SDValue RHS = Op.getOperand(1);
24182 SDValue Carry = Op.getOperand(2);
24183 SDValue Cond = Op.getOperand(3);
24184 SDLoc DL(Op);
24185
24186 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24187 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
24188
24189 // Recreate the carry if needed.
24190 EVT CarryVT = Carry.getValueType();
24191 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24192 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24193
24194 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24195 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24196 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24197}
24198
24199// This function returns three things: the arithmetic computation itself
24200// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24201// flag and the condition code define the case in which the arithmetic
24202// computation overflows.
24203static std::pair<SDValue, SDValue>
24204getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24205 assert(Op.getResNo() == 0 && "Unexpected result number!");
24206 SDValue Value, Overflow;
24207 SDValue LHS = Op.getOperand(0);
24208 SDValue RHS = Op.getOperand(1);
24209 unsigned BaseOp = 0;
24210 SDLoc DL(Op);
24211 switch (Op.getOpcode()) {
24212 default: llvm_unreachable("Unknown ovf instruction!");
24213 case ISD::SADDO:
24214 BaseOp = X86ISD::ADD;
24215 Cond = X86::COND_O;
24216 break;
24217 case ISD::UADDO:
24218 BaseOp = X86ISD::ADD;
24219 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24220 break;
24221 case ISD::SSUBO:
24222 BaseOp = X86ISD::SUB;
24223 Cond = X86::COND_O;
24224 break;
24225 case ISD::USUBO:
24226 BaseOp = X86ISD::SUB;
24227 Cond = X86::COND_B;
24228 break;
24229 case ISD::SMULO:
24230 BaseOp = X86ISD::SMUL;
24231 Cond = X86::COND_O;
24232 break;
24233 case ISD::UMULO:
24234 BaseOp = X86ISD::UMUL;
24235 Cond = X86::COND_O;
24236 break;
24237 }
24238
24239 if (BaseOp) {
24240 // Also sets EFLAGS.
24241 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24242 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24243 Overflow = Value.getValue(1);
24244 }
24245
24246 return std::make_pair(Value, Overflow);
24247}
24248
24249static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24250 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24251 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24252 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24253 // has only one use.
24254 SDLoc DL(Op);
24255 X86::CondCode Cond;
24256 SDValue Value, Overflow;
24257 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24258
24259 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24260 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24261 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24262}
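For the UADDO case above (ignoring the RHS == 1 special case that uses COND_E), the carry-flag condition X86::COND_B corresponds to the wrapped sum being smaller than one of the operands; a scalar model in plain C++:

// Scalar model of UADDO: the carry out of an unsigned add fires exactly when
// the wrapped sum is smaller than an operand.
#include <cassert>
#include <cstdint>

static bool UAddOverflow(uint8_t A, uint8_t B, uint8_t &Sum) {
  Sum = static_cast<uint8_t>(A + B);
  return Sum < A; // carry out (what X86::COND_B checks)
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t Sum;
      assert(UAddOverflow(A, B, Sum) == (A + B > 255u));
    }
  return 0;
}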
24263
24264/// Return true if opcode is a X86 logical comparison.
24265static bool isX86LogicalCmp(SDValue Op) {
24266 unsigned Opc = Op.getOpcode();
24267 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24268 Opc == X86ISD::FCMP)
24269 return true;
24270 if (Op.getResNo() == 1 &&
24271 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24272 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24273 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24274 return true;
24275
24276 return false;
24277}
24278
24279static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24280 if (V.getOpcode() != ISD::TRUNCATE)
24281 return false;
24282
24283 SDValue VOp0 = V.getOperand(0);
24284 unsigned InBits = VOp0.getValueSizeInBits();
24285 unsigned Bits = V.getValueSizeInBits();
24286 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24287}
24288
24289SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24290 bool AddTest = true;
24291 SDValue Cond = Op.getOperand(0);
24292 SDValue Op1 = Op.getOperand(1);
24293 SDValue Op2 = Op.getOperand(2);
24294 SDLoc DL(Op);
24295 MVT VT = Op1.getSimpleValueType();
24296 SDValue CC;
24297
24298 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24299 // are available or VBLENDV if AVX is available.
24300 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24301 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24302 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24303 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24304 bool IsAlwaysSignaling;
24305 unsigned SSECC =
24306 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24307 CondOp0, CondOp1, IsAlwaysSignaling);
24308
24309 if (Subtarget.hasAVX512()) {
24310 SDValue Cmp =
24311 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24312 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24313 assert(!VT.isVector() && "Not a scalar type?");
24314 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24315 }
24316
24317 if (SSECC < 8 || Subtarget.hasAVX()) {
24318 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24319 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24320
24321 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24322 // of 3 logic instructions for size savings and potentially speed.
24323 // Unfortunately, there is no scalar form of VBLENDV.
24324
24325 // If either operand is a +0.0 constant, don't try this. We can expect to
24326 // optimize away at least one of the logic instructions later in that
24327 // case, so that sequence would be faster than a variable blend.
24328
24329 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24330 // uses XMM0 as the selection register. That may need just as many
24331 // instructions as the AND/ANDN/OR sequence due to register moves, so
24332 // don't bother.
24333 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24334 !isNullFPConstant(Op2)) {
24335 // Convert to vectors, do a VSELECT, and convert back to scalar.
24336 // All of the conversions should be optimized away.
24337 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24338 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24339 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24340 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24341
24342 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24343 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24344
24345 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24346
24347 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24348 VSel, DAG.getIntPtrConstant(0, DL));
24349 }
24350 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24351 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24352 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24353 }
24354 }
24355
24356 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24357 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24358 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24359 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24360 }
24361
24362 if (Cond.getOpcode() == ISD::SETCC) {
24363 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24364 Cond = NewCond;
24365 // If the condition was updated, it's possible that the operands of the
24366 // select were also updated (for example, EmitTest has a RAUW). Refresh
24367 // the local references to the select operands in case they got stale.
24368 Op1 = Op.getOperand(1);
24369 Op2 = Op.getOperand(2);
24370 }
24371 }
24372
24373 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24374 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24375 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24376 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24377 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24378 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24379 if (Cond.getOpcode() == X86ISD::SETCC &&
24380 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24381 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24382 SDValue Cmp = Cond.getOperand(1);
24383 SDValue CmpOp0 = Cmp.getOperand(0);
24384 unsigned CondCode = Cond.getConstantOperandVal(0);
24385
24386 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24387 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24388 // handling to keep the CMP with 0. This should be removed by
24389 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24390 // cttz_zero_undef.
24391 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24392 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24393 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24394 };
24395 if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
24396 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24397 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24398 // Keep Cmp.
24399 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24400 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24401 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24402
24403 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24404 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24405
24406 // Apply further optimizations for special cases
24407 // (select (x != 0), -1, 0) -> neg & sbb
24408 // (select (x == 0), 0, -1) -> neg & sbb
24409 if (isNullConstant(Y) &&
24410 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
24411 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24412 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24413 Zero = DAG.getConstant(0, DL, Op.getValueType());
24414 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
24415 }
24416
24417 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
24418 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
24419
24420 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
24421 SDValue Res = // Res = 0 or -1.
24422 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
24423
24424 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
24425 Res = DAG.getNOT(DL, Res, Res.getValueType());
24426
24427 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
24428 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
24429 Cmp.getOperand(0).getOpcode() == ISD::AND &&
24430 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
24431 SDValue Src1, Src2;
24432 // True if Op2 is an XOR or OR operator and one of its operands
24433 // is equal to Op1
24434 // ( a , a op b) || ( b , a op b)
24435 auto isOrXorPattern = [&]() {
24436 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24437 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24438 Src1 =
24439 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24440 Src2 = Op1;
24441 return true;
24442 }
24443 return false;
24444 };
24445
24446 if (isOrXorPattern()) {
24447 SDValue Neg;
24448 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24449 // We need a mask of all zeros or all ones with the same size as the
24450 // other operands.
24451 if (CmpSz > VT.getSizeInBits())
24452 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24453 else if (CmpSz < VT.getSizeInBits())
24454 Neg = DAG.getNode(ISD::AND, DL, VT,
24455 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24456 DAG.getConstant(1, DL, VT));
24457 else
24458 Neg = CmpOp0;
24459 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
24460 Neg); // -(and (x, 0x1))
24461 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24462 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24463 }
24464 }
24465 }
24466
24467 // Look past (and (setcc_carry (cmp ...)), 1).
24468 if (Cond.getOpcode() == ISD::AND &&
24469 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24470 isOneConstant(Cond.getOperand(1)))
24471 Cond = Cond.getOperand(0);
24472
24473 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24474 // setting operand in place of the X86ISD::SETCC.
24475 unsigned CondOpcode = Cond.getOpcode();
24476 if (CondOpcode == X86ISD::SETCC ||
24477 CondOpcode == X86ISD::SETCC_CARRY) {
24478 CC = Cond.getOperand(0);
24479
24480 SDValue Cmp = Cond.getOperand(1);
24481 bool IllegalFPCMov = false;
24482 if (VT.isFloatingPoint() && !VT.isVector() &&
24483 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
24484 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24485
24486 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24487 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24488 Cond = Cmp;
24489 AddTest = false;
24490 }
24491 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24492 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24493 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24494 SDValue Value;
24495 X86::CondCode X86Cond;
24496 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24497
24498 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24499 AddTest = false;
24500 }
24501
24502 if (AddTest) {
24503 // Look past the truncate if the high bits are known zero.
24504 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24505 Cond = Cond.getOperand(0);
24506
24507 // We know the result of AND is compared against zero. Try to match
24508 // it to BT.
24509 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24510 SDValue BTCC;
24511 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
24512 CC = BTCC;
24513 Cond = BT;
24514 AddTest = false;
24515 }
24516 }
24517 }
24518
24519 if (AddTest) {
24520 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24521 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24522 }
24523
24524 // a < b ? -1 : 0 -> RES = ~setcc_carry
24525 // a < b ? 0 : -1 -> RES = setcc_carry
24526 // a >= b ? -1 : 0 -> RES = setcc_carry
24527 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24528 if (Cond.getOpcode() == X86ISD::SUB) {
24529 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
24530
24531 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24532 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24533 (isNullConstant(Op1) || isNullConstant(Op2))) {
24534 SDValue Res =
24535 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24536 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24537 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24538 return DAG.getNOT(DL, Res, Res.getValueType());
24539 return Res;
24540 }
24541 }
24542
24543 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24544 // widen the cmov and push the truncate through. This avoids introducing a new
24545 // branch during isel and doesn't add any extensions.
24546 if (Op.getValueType() == MVT::i8 &&
24547 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24548 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24549 if (T1.getValueType() == T2.getValueType() &&
24550 // Exclude CopyFromReg to avoid partial register stalls.
24551 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24552 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24553 CC, Cond);
24554 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24555 }
24556 }
24557
24558 // Or finally, promote i8 cmovs if we have CMOV,
24559 // or i16 cmovs if it won't prevent folding a load.
24560 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24561 // legal, but EmitLoweredSelect() can not deal with these extensions
24562 // being inserted between two CMOV's. (in i16 case too TBN)
24563 // https://bugs.llvm.org/show_bug.cgi?id=40974
24564 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
24565 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
24566 !MayFoldLoad(Op2))) {
24567 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24568 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24569 SDValue Ops[] = { Op2, Op1, CC, Cond };
24570 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24571 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24572 }
24573
24574 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24575 // condition is true.
24576 SDValue Ops[] = { Op2, Op1, CC, Cond };
24577 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
24578}
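// --- Editor's illustrative sketch (not part of the original file) ---
// Because x86 has no 8-bit CMOV, the lowering above any-extends the i8
// operands to i32, emits a 32-bit CMOV, and truncates the result back. Only
// the low 8 bits of the widened values matter, so the kind of extension is
// irrelevant. A scalar analogue of that widen/select/truncate sequence:
static inline unsigned char sketchCmovI8(bool Cond, unsigned char T,
                                         unsigned char F) {
  unsigned WideT = T;                      // ANY_EXTEND to i32
  unsigned WideF = F;                      // ANY_EXTEND to i32
  unsigned Wide = Cond ? WideT : WideF;    // 32-bit CMOV
  return static_cast<unsigned char>(Wide); // TRUNCATE back to i8
}
// --- End of sketch ---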
24579
24580static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24581 const X86Subtarget &Subtarget,
24582 SelectionDAG &DAG) {
24583 MVT VT = Op->getSimpleValueType(0);
24584 SDValue In = Op->getOperand(0);
24585 MVT InVT = In.getSimpleValueType();
24586 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24587 MVT VTElt = VT.getVectorElementType();
24588 SDLoc dl(Op);
24589
24590 unsigned NumElts = VT.getVectorNumElements();
24591
24592 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24593 MVT ExtVT = VT;
24594 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24595 // If v16i32 is to be avoided, we'll need to split and concatenate.
24596 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24597 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24598
24599 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24600 }
24601
24602 // Widen to 512-bits if VLX is not supported.
24603 MVT WideVT = ExtVT;
24604 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24605 NumElts *= 512 / ExtVT.getSizeInBits();
24606 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24607 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24608 In, DAG.getIntPtrConstant(0, dl));
24609 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24610 }
24611
24612 SDValue V;
24613 MVT WideEltVT = WideVT.getVectorElementType();
24614 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24615 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24616 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24617 } else {
24618 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24619 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24620 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24621 }
24622
24623 // Truncate if we had to extend i16/i8 above.
24624 if (VT != ExtVT) {
24625 WideVT = MVT::getVectorVT(VTElt, NumElts);
24626 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24627 }
24628
24629 // Extract back to 128/256-bit if we widened.
24630 if (WideVT != VT)
24631 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24632 DAG.getIntPtrConstant(0, dl));
24633
24634 return V;
24635}
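// --- Editor's illustrative sketch (not part of the original file) ---
// Sign-extending a vXi1 mask means turning each predicate bit into an
// all-ones or all-zeros lane, which is exactly the select(In, -1, 0) emitted
// above when no direct mask-extension instruction is available. A scalar
// per-lane analogue for a 32-bit destination element:
static inline int sketchSignExtendMaskBit(bool MaskBit) {
  return MaskBit ? -1 : 0; // -1 is the all-ones pattern in two's complement
}
// --- End of sketch ---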
24636
24637static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24638 SelectionDAG &DAG) {
24639 SDValue In = Op->getOperand(0);
24640 MVT InVT = In.getSimpleValueType();
24641
24642 if (InVT.getVectorElementType() == MVT::i1)
24643 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24644
24645 assert(Subtarget.hasAVX() && "Expected AVX support");
24646 return LowerAVXExtend(Op, DAG, Subtarget);
24647}
24648
24649// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24650// For sign extend this needs to handle all vector sizes and SSE4.1 and
24651// non-SSE4.1 targets. For zero extend this should only handle inputs of
24652// MVT::v64i8 when BWI is not supported, but AVX512 is.
24653static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24654 const X86Subtarget &Subtarget,
24655 SelectionDAG &DAG) {
24656 SDValue In = Op->getOperand(0);
24657 MVT VT = Op->getSimpleValueType(0);
24658 MVT InVT = In.getSimpleValueType();
24659
24660 MVT SVT = VT.getVectorElementType();
24661 MVT InSVT = InVT.getVectorElementType();
24662 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
24663
24664 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24665 return SDValue();
24666 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24667 return SDValue();
24668 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24669 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24670 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24671 return SDValue();
24672
24673 SDLoc dl(Op);
24674 unsigned Opc = Op.getOpcode();
24675 unsigned NumElts = VT.getVectorNumElements();
24676
24677 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24678 // For 512-bit vectors, we need 128-bits or 256-bits.
24679 if (InVT.getSizeInBits() > 128) {
24680 // Input needs to be at least the same number of elements as output, and
24681 // at least 128-bits.
24682 int InSize = InSVT.getSizeInBits() * NumElts;
24683 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24684 InVT = In.getSimpleValueType();
24685 }
24686
24687 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24688 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24689 // need to be handled here for 256/512-bit results.
24690 if (Subtarget.hasInt256()) {
24691 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24692
24693 if (InVT.getVectorNumElements() != NumElts)
24694 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24695
24696 // FIXME: Apparently we create inreg operations that could be regular
24697 // extends.
24698 unsigned ExtOpc =
24699 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24700 : ISD::ZERO_EXTEND;
24701 return DAG.getNode(ExtOpc, dl, VT, In);
24702 }
24703
24704 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24705 if (Subtarget.hasAVX()) {
24706 assert(VT.is256BitVector() && "256-bit vector expected");
24707 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24708 int HalfNumElts = HalfVT.getVectorNumElements();
24709
24710 unsigned NumSrcElts = InVT.getVectorNumElements();
24711 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24712 for (int i = 0; i != HalfNumElts; ++i)
24713 HiMask[i] = HalfNumElts + i;
24714
24715 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24716 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24717 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24718 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24719 }
24720
24721 // We should only get here for sign extend.
24722 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24723 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24724
24725 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24726 SDValue Curr = In;
24727 SDValue SignExt = Curr;
24728
24729 // As SRAI is only available on i16/i32 types, we expand only up to i32
24730 // and handle i64 separately.
24731 if (InVT != MVT::v4i32) {
24732 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24733
24734 unsigned DestWidth = DestVT.getScalarSizeInBits();
24735 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24736
24737 unsigned InNumElts = InVT.getVectorNumElements();
24738 unsigned DestElts = DestVT.getVectorNumElements();
24739
24740 // Build a shuffle mask that takes each input element and places it in the
24741 // MSBs of the new element size.
24742 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24743 for (unsigned i = 0; i != DestElts; ++i)
24744 Mask[i * Scale + (Scale - 1)] = i;
24745
24746 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24747 Curr = DAG.getBitcast(DestVT, Curr);
24748
24749 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24750 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24751 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24752 }
24753
24754 if (VT == MVT::v2i64) {
24755 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24756 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24757 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24758 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24759 SignExt = DAG.getBitcast(VT, SignExt);
24760 }
24761
24762 return SignExt;
24763}
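// --- Editor's illustrative sketch (not part of the original file) ---
// The pre-SSE4.1 path above shuffles each narrow element into the most
// significant bits of the wider element and then arithmetic-shifts it back
// down, so the sign bit is replicated across the new high bits. A scalar
// analogue of extending an i8 lane to an i32 lane this way:
static inline int sketchSignExtendViaShift(unsigned char Lane) {
  int Widened = static_cast<int>(static_cast<unsigned>(Lane) << 24); // to MSBs
  return Widened >> 24; // arithmetic shift replicates the sign bit (VSRAI)
}
// --- End of sketch ---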
24764
24765static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24766 SelectionDAG &DAG) {
24767 MVT VT = Op->getSimpleValueType(0);
24768 SDValue In = Op->getOperand(0);
24769 MVT InVT = In.getSimpleValueType();
24770 SDLoc dl(Op);
24771
24772 if (InVT.getVectorElementType() == MVT::i1)
24773 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24774
24775 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24776 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
24777 "Expected same number of elements");
24778 assert((VT.getVectorElementType() == MVT::i16 ||
24779 VT.getVectorElementType() == MVT::i32 ||
24780 VT.getVectorElementType() == MVT::i64) &&
24781 "Unexpected element type");
24782 assert((InVT.getVectorElementType() == MVT::i8 ||
24783 InVT.getVectorElementType() == MVT::i16 ||
24784 InVT.getVectorElementType() == MVT::i32) &&
24785 "Unexpected element type");
24786
24787 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24788 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24789 return splitVectorIntUnary(Op, DAG);
24790 }
24791
24792 if (Subtarget.hasInt256())
24793 return Op;
24794
24795 // Optimize vectors in AVX mode
24796 // Sign extend v8i16 to v8i32 and
24797 // v4i32 to v4i64
24798 //
24799 // Divide input vector into two parts
24800 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
24801 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
24802 // concat the vectors to original VT
24803 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24804 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24805
24806 unsigned NumElems = InVT.getVectorNumElements();
24807 SmallVector<int,8> ShufMask(NumElems, -1);
24808 for (unsigned i = 0; i != NumElems/2; ++i)
24809 ShufMask[i] = i + NumElems/2;
24810
24811 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24812 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24813
24814 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24815}
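// --- Editor's illustrative sketch (not part of the original file) ---
// Without AVX2, the 256-bit sign extend above is split in two: the low half
// is extended in place, the high half is first shuffled down into the low
// 128 bits and extended, and the results are concatenated. A scalar analogue
// for v8i16 -> v8i32, with plain arrays standing in for the vectors:
static inline void sketchSplitSignExtendV8I16(const short In[8], int Out[8]) {
  for (int i = 0; i != 4; ++i)
    Out[i] = In[i];         // OpLo: sign_extend_vector_inreg of the low half
  for (int i = 0; i != 4; ++i)
    Out[4 + i] = In[4 + i]; // OpHi: shuffle {4,5,6,7} down, then extend
  // The real lowering then concatenates the two 128-bit halves back to 256.
}
// --- End of sketch ---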
24816
24817/// Change a vector store into a pair of half-size vector stores.
24818static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24819 SDValue StoredVal = Store->getValue();
24820 assert((StoredVal.getValueType().is256BitVector() ||
24821 StoredVal.getValueType().is512BitVector()) &&
24822 "Expecting 256/512-bit op");
24823
24824 // Splitting volatile memory ops is not allowed unless the operation was not
24825 // legal to begin with. Assume the input store is legal (this transform is
24826 // only used for targets with AVX). Note: It is possible that we have an
24827 // illegal type like v2i128, and so we could allow splitting a volatile store
24828 // in that case if that is important.
24829 if (!Store->isSimple())
24830 return SDValue();
24831
24832 SDLoc DL(Store);
24833 SDValue Value0, Value1;
24834 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24835 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24836 SDValue Ptr0 = Store->getBasePtr();
24837 SDValue Ptr1 =
24838 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
24839 SDValue Ch0 =
24840 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24841 Store->getOriginalAlign(),
24842 Store->getMemOperand()->getFlags());
24843 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24844 Store->getPointerInfo().getWithOffset(HalfOffset),
24845 Store->getOriginalAlign(),
24846 Store->getMemOperand()->getFlags());
24847 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24848}
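// --- Editor's illustrative sketch (not part of the original file) ---
// splitVectorStore turns one 256/512-bit store into two stores of half the
// width, at Ptr0 and Ptr0 + HalfOffset, joined by a TokenFactor. A byte-level
// analogue of the addressing, assuming a hypothetical 32-byte stored value:
static inline void sketchSplitStore(char *Ptr, const char (&Val)[32]) {
  const unsigned HalfOffset = 16; // getStoreSize() of the low half
  for (unsigned i = 0; i != HalfOffset; ++i)
    Ptr[i] = Val[i];                           // store of Value0 at Ptr0
  for (unsigned i = 0; i != HalfOffset; ++i)
    Ptr[HalfOffset + i] = Val[HalfOffset + i]; // store of Value1 at Ptr0 + 16
}
// --- End of sketch ---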
24849
24850/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24851/// type.
24852static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24853 SelectionDAG &DAG) {
24854 SDValue StoredVal = Store->getValue();
24855 assert(StoreVT.is128BitVector() &&
24856 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24857 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24858
24859 // Splitting volatile memory ops is not allowed unless the operation was not
24860 // legal to begin with. We are assuming the input op is legal (this transform
24861 // is only used for targets with AVX).
24862 if (!Store->isSimple())
24863 return SDValue();
24864
24865 MVT StoreSVT = StoreVT.getScalarType();
24866 unsigned NumElems = StoreVT.getVectorNumElements();
24867 unsigned ScalarSize = StoreSVT.getStoreSize();
24868
24869 SDLoc DL(Store);
24870 SmallVector<SDValue, 4> Stores;
24871 for (unsigned i = 0; i != NumElems; ++i) {
24872 unsigned Offset = i * ScalarSize;
24873 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24874 TypeSize::Fixed(Offset), DL);
24875 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24876 DAG.getIntPtrConstant(i, DL));
24877 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24878 Store->getPointerInfo().getWithOffset(Offset),
24879 Store->getOriginalAlign(),
24880 Store->getMemOperand()->getFlags());
24881 Stores.push_back(Ch);
24882 }
24883 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24884}
24885
24886static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24887 SelectionDAG &DAG) {
24888 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24889 SDLoc dl(St);
24890 SDValue StoredVal = St->getValue();
24891
24892 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24893 if (StoredVal.getValueType().isVector() &&
24894 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24895 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24896 assert(NumElts <= 8 && "Unexpected VT");
24897 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24898 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24899 "Expected AVX512F without AVX512DQI");
24900
24901 // We must pad with zeros so that any unused bits are stored as zero.
24902 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24903 DAG.getUNDEF(MVT::v16i1), StoredVal,
24904 DAG.getIntPtrConstant(0, dl));
24905 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24906 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24907 // Make sure we store zeros in the extra bits.
24908 if (NumElts < 8)
24909 StoredVal = DAG.getZeroExtendInReg(
24910 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24911
24912 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24913 St->getPointerInfo(), St->getOriginalAlign(),
24914 St->getMemOperand()->getFlags());
24915 }
24916
24917 if (St->isTruncatingStore())
24918 return SDValue();
24919
24920 // If this is a 256-bit store of concatenated ops, we are better off splitting
24921 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24922 // and each half can execute independently. Some cores would split the op into
24923 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24924 MVT StoreVT = StoredVal.getSimpleValueType();
24925 if (StoreVT.is256BitVector() ||
24926 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24927 !Subtarget.hasBWI())) {
24928 SmallVector<SDValue, 4> CatOps;
24929 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
24930 return splitVectorStore(St, DAG);
24931 return SDValue();
24932 }
24933
24934 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24935 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
24936 "Unexpected VT");
24937 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24938 TargetLowering::TypeWidenVector && "Unexpected type action!");
24939
24940 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24941 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24942 DAG.getUNDEF(StoreVT));
24943
24944 if (Subtarget.hasSSE2()) {
24945 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24946 // and store it.
24947 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24948 MVT CastVT = MVT::getVectorVT(StVT, 2);
24949 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24950 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24951 DAG.getIntPtrConstant(0, dl));
24952
24953 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24954 St->getPointerInfo(), St->getOriginalAlign(),
24955 St->getMemOperand()->getFlags());
24956 }
24957 assert(Subtarget.hasSSE1() && "Expected SSE");
24958 SDVTList Tys = DAG.getVTList(MVT::Other);
24959 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24960 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24961 St->getMemOperand());
24962}
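// --- Editor's illustrative sketch (not part of the original file) ---
// For v2i1/v4i1/v8i1 stores without AVX512DQ, the mask is widened to v16i1,
// bitcast to an integer, truncated to i8, and the bits beyond NumElts are
// forced to zero before a plain byte store. A scalar analogue of that bit
// packing, assuming the lanes arrive as an array of bools:
static inline unsigned char sketchPackMaskStore(const bool *Lanes,
                                                unsigned NumElts) {
  unsigned char Byte = 0;
  for (unsigned i = 0; i != NumElts; ++i)
    Byte |= static_cast<unsigned char>(Lanes[i]) << i; // lane i -> bit i
  // Bits NumElts..7 stay zero, matching the getZeroExtendInReg above.
  return Byte;
}
// --- End of sketch ---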
24963
24964// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24965// may emit an illegal shuffle but the expansion is still better than scalar
24966 // code. We generate sext/sext_invec for SEXTLOADs if available; otherwise
24967 // we'll emit a shuffle and an arithmetic shift.
24968// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24969// TODO: It is possible to support ZExt by zeroing the undef values during
24970// the shuffle phase or after the shuffle.
24971static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24972 SelectionDAG &DAG) {
24973 MVT RegVT = Op.getSimpleValueType();
24974 assert(RegVT.isVector() && "We only custom lower vector loads.");
24975 assert(RegVT.isInteger() &&
24976 "We only custom lower integer vector loads.");
24977
24978 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24979 SDLoc dl(Ld);
24980
24981 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24982 if (RegVT.getVectorElementType() == MVT::i1) {
24983 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24984 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24985 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24986 "Expected AVX512F without AVX512DQI");
24987
24988 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24989 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24990 Ld->getMemOperand()->getFlags());
24991
24992 // Replace chain users with the new chain.
24993 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24994
24995 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24996 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24997 DAG.getBitcast(MVT::v16i1, Val),
24998 DAG.getIntPtrConstant(0, dl));
24999 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25000 }
25001
25002 return SDValue();
25003}
25004
25005/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25006/// each of which has no other use apart from the AND / OR.
25007static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25008 Opc = Op.getOpcode();
25009 if (Opc != ISD::OR && Opc != ISD::AND)
25010 return false;
25011 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25012 Op.getOperand(0).hasOneUse() &&
25013 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25014 Op.getOperand(1).hasOneUse());
25015}
25016
25017SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25018 SDValue Chain = Op.getOperand(0);
25019 SDValue Cond = Op.getOperand(1);
25020 SDValue Dest = Op.getOperand(2);
25021 SDLoc dl(Op);
25022
25023 if (Cond.getOpcode() == ISD::SETCC &&
25024 Cond.getOperand(0).getValueType() != MVT::f128) {
25025 SDValue LHS = Cond.getOperand(0);
25026 SDValue RHS = Cond.getOperand(1);
25027 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25028
25029 // Special case for
25030 // setcc([su]{add,sub,mul}o == 0)
25031 // setcc([su]{add,sub,mul}o != 1)
25032 if (ISD::isOverflowIntrOpRes(LHS) &&
25033 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25034 (isNullConstant(RHS) || isOneConstant(RHS))) {
25035 SDValue Value, Overflow;
25036 X86::CondCode X86Cond;
25037 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25038
25039 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25040 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25041
25042 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25043 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25044 Overflow);
25045 }
25046
25047 if (LHS.getSimpleValueType().isInteger()) {
25048 SDValue CCVal;
25049 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25050 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25051 EFLAGS);
25052 }
25053
25054 if (CC == ISD::SETOEQ) {
25055 // For FCMP_OEQ, we can emit
25056 // two branches instead of an explicit AND instruction with a
25057 // separate test. However, we only do this if this block doesn't
25058 // have a fall-through edge, because this requires an explicit
25059 // jmp when the condition is false.
25060 if (Op.getNode()->hasOneUse()) {
25061 SDNode *User = *Op.getNode()->use_begin();
25062 // Look for an unconditional branch following this conditional branch.
25063 // We need this because we need to reverse the successors in order
25064 // to implement FCMP_OEQ.
25065 if (User->getOpcode() == ISD::BR) {
25066 SDValue FalseBB = User->getOperand(1);
25067 SDNode *NewBR =
25068 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25069 assert(NewBR == User);
25070 (void)NewBR;
25071 Dest = FalseBB;
25072
25073 SDValue Cmp =
25074 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25075 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25076 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25077 CCVal, Cmp);
25078 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25079 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25080 Cmp);
25081 }
25082 }
25083 } else if (CC == ISD::SETUNE) {
25084 // For FCMP_UNE, we can emit
25085 // two branches instead of an explicit OR instruction with a
25086 // separate test.
25087 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25088 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25089 Chain =
25090 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
25091 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25092 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25093 Cmp);
25094 } else {
25095 X86::CondCode X86Cond =
25096 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25097 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25098 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25099 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25100 Cmp);
25101 }
25102 }
25103
25104 if (ISD::isOverflowIntrOpRes(Cond)) {
25105 SDValue Value, Overflow;
25106 X86::CondCode X86Cond;
25107 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25108
25109 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25110 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25111 Overflow);
25112 }
25113
25114 // Look past the truncate if the high bits are known zero.
25115 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25116 Cond = Cond.getOperand(0);
25117
25118 EVT CondVT = Cond.getValueType();
25119
25120 // Add an AND with 1 if we don't already have one.
25121 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25122 Cond =
25123 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25124
25125 SDValue LHS = Cond;
25126 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25127
25128 SDValue CCVal;
25129 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25130 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25131 EFLAGS);
25132}
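// --- Editor's illustrative sketch (not part of the original file) ---
// For FCMP_OEQ the code above emits two branches instead of AND-ing two
// X86ISD::SETCC results: the true destination is reached only when the
// operands are neither unequal (COND_NE) nor unordered (COND_P, the parity
// flag set for a NaN). A scalar analogue of that ordered-equal test:
static inline bool sketchOrderedEqual(double A, double B) {
  bool Unordered = (A != A) || (B != B); // NaN check, i.e. the parity branch
  if (Unordered)
    return false; // branch on COND_P away from the true block
  return A == B;  // branch on COND_NE away from the true block otherwise
}
// --- End of sketch ---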
25133
25134// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25135// Calls to _alloca are needed to probe the stack when allocating more than 4k
25136// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25137// that the guard pages used by the OS virtual memory manager are allocated in
25138// correct sequence.
25139SDValue
25140X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25141 SelectionDAG &DAG) const {
25142 MachineFunction &MF = DAG.getMachineFunction();
25143 bool SplitStack = MF.shouldSplitStack();
25144 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25145 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25146 SplitStack || EmitStackProbeCall;
25147 SDLoc dl(Op);
25148
25149 // Get the inputs.
25150 SDNode *Node = Op.getNode();
25151 SDValue Chain = Op.getOperand(0);
25152 SDValue Size = Op.getOperand(1);
25153 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25154 EVT VT = Node->getValueType(0);
25155
25156 // Chain the dynamic stack allocation so that it doesn't modify the stack
25157 // pointer when other instructions are using the stack.
25158 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25159
25160 bool Is64Bit = Subtarget.is64Bit();
25161 MVT SPTy = getPointerTy(DAG.getDataLayout());
25162
25163 SDValue Result;
25164 if (!Lower) {
25165 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25166 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25167 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25168 " not tell us which reg is the stack pointer!");
25169
25170 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25171 const Align StackAlign = TFI.getStackAlign();
25172 if (hasInlineStackProbe(MF)) {
25173 MachineRegisterInfo &MRI = MF.getRegInfo();
25174
25175 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25176 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25177 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25178 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
25179 DAG.getRegister(Vreg, SPTy));
25180 } else {
25181 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25182 Chain = SP.getValue(1);
25183 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25184 }
25185 if (Alignment && *Alignment > StackAlign)
25186 Result =
25187 DAG.getNode(ISD::AND, dl, VT, Result,
25188 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25189 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25190 } else if (SplitStack) {
25191 MachineRegisterInfo &MRI = MF.getRegInfo();
25192
25193 if (Is64Bit) {
25194 // The 64-bit implementation of segmented stacks needs to clobber both r10 and
25195 // r11. This makes it impossible to use it along with nested parameters.
25196 const Function &F = MF.getFunction();
25197 for (const auto &A : F.args()) {
25198 if (A.hasNestAttr())
25199 report_fatal_error("Cannot use segmented stacks with functions that "
25200 "have nested arguments.");
25201 }
25202 }
25203
25204 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25205 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
25206 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
25207 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
25208 DAG.getRegister(Vreg, SPTy));
25209 } else {
25210 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25211 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
25212 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
25213
25214 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25215 Register SPReg = RegInfo->getStackRegister();
25216 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25217 Chain = SP.getValue(1);
25218
25219 if (Alignment) {
25220 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
25221 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
25222 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25223 }
25224
25225 Result = SP;
25226 }
25227
25228 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
25229 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
25230
25231 SDValue Ops[2] = {Result, Chain};
25232 return DAG.getMergeValues(Ops, dl);
25233}
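// --- Editor's illustrative sketch (not part of the original file) ---
// When an over-aligned dynamic allocation is requested, the lowering above
// rounds the new stack pointer down with an AND of ~(Alignment - 1). A
// pointer-free analogue of that mask, assuming Align is a power of two:
static inline unsigned long long sketchAlignDown(unsigned long long SP,
                                                 unsigned long long Align) {
  // e.g. Align == 32: ~(32 - 1) == ...11100000, clearing the low five bits.
  return SP & ~(Align - 1ULL);
}
// --- End of sketch ---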
25234
25235SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25236 MachineFunction &MF = DAG.getMachineFunction();
25237 auto PtrVT = getPointerTy(MF.getDataLayout());
25238 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25239
25240 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25241 SDLoc DL(Op);
25242
25243 if (!Subtarget.is64Bit() ||
25244 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25245 // vastart just stores the address of the VarArgsFrameIndex slot into the
25246 // memory location argument.
25247 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25248 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
25249 MachinePointerInfo(SV));
25250 }
25251
25252 // __va_list_tag:
25253 // gp_offset (0 - 6 * 8)
25254 // fp_offset (48 - 48 + 8 * 16)
25255 // overflow_arg_area (point to parameters coming in memory).
25256 // reg_save_area
25257 SmallVector<SDValue, 8> MemOps;
25258 SDValue FIN = Op.getOperand(1);
25259 // Store gp_offset
25260 SDValue Store = DAG.getStore(
25261 Op.getOperand(0), DL,
25262 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25263 MachinePointerInfo(SV));
25264 MemOps.push_back(Store);
25265
25266 // Store fp_offset
25267 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
25268 Store = DAG.getStore(
25269 Op.getOperand(0), DL,
25270 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25271 MachinePointerInfo(SV, 4));
25272 MemOps.push_back(Store);
25273
25274 // Store ptr to overflow_arg_area
25275 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25276 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25277 Store =
25278 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25279 MemOps.push_back(Store);
25280
25281 // Store ptr to reg_save_area.
25282 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25283 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25284 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25285 Store = DAG.getStore(
25286 Op.getOperand(0), DL, RSFIN, FIN,
25287 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25288 MemOps.push_back(Store);
25289 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25290}
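// --- Editor's illustrative sketch (not part of the original file) ---
// The four stores above fill in the SysV x86-64 __va_list_tag at offsets
// 0, 4, 8 and 16 (LP64). The equivalent C layout, with the field offsets the
// lowering hard-codes:
struct SketchVaListTag {
  unsigned gp_offset;      // offset 0: next GPR slot in the reg save area
  unsigned fp_offset;      // offset 4: next XMM slot in the reg save area
  void *overflow_arg_area; // offset 8: arguments passed on the stack
  void *reg_save_area;     // offset 16: spilled register arguments
};
// --- End of sketch ---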
25291
25292SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25293 assert(Subtarget.is64Bit() &&
25294 "LowerVAARG only handles 64-bit va_arg!");
25295 assert(Op.getNumOperands() == 4);
25296
25297 MachineFunction &MF = DAG.getMachineFunction();
25298 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25299 // The Win64 ABI uses char* instead of a structure.
25300 return DAG.expandVAArg(Op.getNode());
25301
25302 SDValue Chain = Op.getOperand(0);
25303 SDValue SrcPtr = Op.getOperand(1);
25304 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25305 unsigned Align = Op.getConstantOperandVal(3);
25306 SDLoc dl(Op);
25307
25308 EVT ArgVT = Op.getNode()->getValueType(0);
25309 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25310 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25311 uint8_t ArgMode;
25312
25313 // Decide which area this value should be read from.
25314 // TODO: Implement the AMD64 ABI in its entirety. This simple
25315 // selection mechanism works only for the basic types.
25316 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25317 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25318 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25319 } else {
25320 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25321 "Unhandled argument type in LowerVAARG");
25322 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25323 }
25324
25325 if (ArgMode == 2) {
25326 // Sanity Check: Make sure using fp_offset makes sense.
25327 assert(!Subtarget.useSoftFloat() &&
25328 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25329 Subtarget.hasSSE1());
25330 }
25331
25332 // Insert VAARG node into the DAG
25333 // VAARG returns two values: Variable Argument Address, Chain
25334 SDValue InstOps[] = {Chain, SrcPtr,
25335 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25336 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25337 DAG.getTargetConstant(Align, dl, MVT::i32)};
25338 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25339 SDValue VAARG = DAG.getMemIntrinsicNode(
25340 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25341 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25342 /*Alignment=*/None,
25343 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25344 Chain = VAARG.getValue(1);
25345
25346 // Load the next argument and return it
25347 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25348}
25349
25350static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25351 SelectionDAG &DAG) {
25352 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25353 // where a va_list is still an i8*.
25354 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25355 if (Subtarget.isCallingConvWin64(
25356 DAG.getMachineFunction().getFunction().getCallingConv()))
25357 // Probably a Win64 va_copy.
25358 return DAG.expandVACopy(Op.getNode());
25359
25360 SDValue Chain = Op.getOperand(0);
25361 SDValue DstPtr = Op.getOperand(1);
25362 SDValue SrcPtr = Op.getOperand(2);
25363 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25364 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25365 SDLoc DL(Op);
25366
25367 return DAG.getMemcpy(
25368 Chain, DL, DstPtr, SrcPtr,
25369 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25370 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25371 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25372}
25373
25374// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25375static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25376 switch (Opc) {
25377 case ISD::SHL:
25378 case X86ISD::VSHL:
25379 case X86ISD::VSHLI:
25380 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25381 case ISD::SRL:
25382 case X86ISD::VSRL:
25383 case X86ISD::VSRLI:
25384 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25385 case ISD::SRA:
25386 case X86ISD::VSRA:
25387 case X86ISD::VSRAI:
25388 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25389 }
25390 llvm_unreachable("Unknown target vector shift node");
25391}
25392
25393/// Handle vector element shifts where the shift amount is a constant.
25394/// Takes immediate version of shift as input.
25395static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25396 SDValue SrcOp, uint64_t ShiftAmt,
25397 SelectionDAG &DAG) {
25398 MVT ElementType = VT.getVectorElementType();
25399
25400 // Bitcast the source vector to the output type, this is mainly necessary for
25401 // vXi8/vXi64 shifts.
25402 if (VT != SrcOp.getSimpleValueType())
25403 SrcOp = DAG.getBitcast(VT, SrcOp);
25404
25405 // Fold this packed shift into its first operand if ShiftAmt is 0.
25406 if (ShiftAmt == 0)
25407 return SrcOp;
25408
25409 // Check for ShiftAmt >= element width
25410 if (ShiftAmt >= ElementType.getSizeInBits()) {
25411 if (Opc == X86ISD::VSRAI)
25412 ShiftAmt = ElementType.getSizeInBits() - 1;
25413 else
25414 return DAG.getConstant(0, dl, VT);
25415 }
25416
25417 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25418 && "Unknown target vector shift-by-constant node");
25419
25420 // Fold this packed vector shift into a build vector if SrcOp is a
25421 // vector of Constants or UNDEFs.
25422 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25423 SmallVector<SDValue, 8> Elts;
25424 unsigned NumElts = SrcOp->getNumOperands();
25425
25426 switch (Opc) {
25427 default: llvm_unreachable("Unknown opcode!");
25428 case X86ISD::VSHLI:
25429 for (unsigned i = 0; i != NumElts; ++i) {
25430 SDValue CurrentOp = SrcOp->getOperand(i);
25431 if (CurrentOp->isUndef()) {
25432 // Must produce 0s in the correct bits.
25433 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25434 continue;
25435 }
25436 auto *ND = cast<ConstantSDNode>(CurrentOp);
25437 const APInt &C = ND->getAPIntValue();
25438 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
25439 }
25440 break;
25441 case X86ISD::VSRLI:
25442 for (unsigned i = 0; i != NumElts; ++i) {
25443 SDValue CurrentOp = SrcOp->getOperand(i);
25444 if (CurrentOp->isUndef()) {
25445 // Must produce 0s in the correct bits.
25446 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25447 continue;
25448 }
25449 auto *ND = cast<ConstantSDNode>(CurrentOp);
25450 const APInt &C = ND->getAPIntValue();
25451 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
25452 }
25453 break;
25454 case X86ISD::VSRAI:
25455 for (unsigned i = 0; i != NumElts; ++i) {
25456 SDValue CurrentOp = SrcOp->getOperand(i);
25457 if (CurrentOp->isUndef()) {
25458 // All shifted in bits must be the same so use 0.
25459 Elts.push_back(DAG.getConstant(0, dl, ElementType));
25460 continue;
25461 }
25462 auto *ND = cast<ConstantSDNode>(CurrentOp);
25463 const APInt &C = ND->getAPIntValue();
25464 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
25465 }
25466 break;
25467 }
25468
25469 return DAG.getBuildVector(VT, dl, Elts);
25470 }
25471
25472 return DAG.getNode(Opc, dl, VT, SrcOp,
25473 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25474}
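// For example, a VSHLI of the constant vector <i32 1, i32 2, undef, i32 4> by
// 4 folds straight to the build_vector <i32 16, i32 32, i32 0, i32 64> (undef
// lanes become 0 so the shifted-in bits stay well defined); a non-constant
// source instead becomes (VSHLI Src, TargetConstant:i8<4>).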
25475
25476/// Handle vector element shifts where the shift amount may or may not be a
25477/// constant. Takes immediate version of shift as input.
25478static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25479 SDValue SrcOp, SDValue ShAmt,
25480 const X86Subtarget &Subtarget,
25481 SelectionDAG &DAG) {
25482 MVT SVT = ShAmt.getSimpleValueType();
25483 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
25484
25485 // Catch shift-by-constant.
25486 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25487 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
25488 CShAmt->getZExtValue(), DAG);
25489
25490 // Change opcode to non-immediate version.
25491 Opc = getTargetVShiftUniformOpcode(Opc, true);
25492
25493 // Need to build a vector containing the shift amount.
25494 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
25495 // +====================+============+=======================================+
25496 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
25497 // +====================+============+=======================================+
25498 // | i64 | Yes, No | Use ShAmt as lowest elt |
25499 // | i32 | Yes | zero-extend in-reg |
25500 // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg |
25501 // | (i32 zext(i16/i8)) | No | byte-shift-in-reg |
25502 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud) |
25503 // +====================+============+=======================================+
25504
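// For example, with an i32 shift amount and no SSE4.1 (last row of the table
// above), ShAmt is built as v4i32 (ShAmt, 0, undef, undef); the two defined
// lanes supply a well-defined low 64 bits, which is all the packed shift reads.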
25505 if (SVT == MVT::i64)
25506 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
25507 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
25508 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25509 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
25510 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
25511 ShAmt = ShAmt.getOperand(0);
25512 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
25513 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
25514 if (Subtarget.hasSSE41())
25515 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25516 MVT::v2i64, ShAmt);
25517 else {
25518 SDValue ByteShift = DAG.getTargetConstant(
25519 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25520 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25521 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25522 ByteShift);
25523 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25524 ByteShift);
25525 }
25526 } else if (Subtarget.hasSSE41() &&
25527 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25528 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25529 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25530 MVT::v2i64, ShAmt);
25531 } else {
25532 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
25533 DAG.getUNDEF(SVT)};
25534 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
25535 }
25536
25537 // The return type has to be a 128-bit type with the same element
25538 // type as the input type.
25539 MVT EltVT = VT.getVectorElementType();
25540 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25541
25542 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25543 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25544}
25545
25546/// Return Mask with the necessary casting or extending
25547/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25548static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25549 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25550 const SDLoc &dl) {
25551
25552 if (isAllOnesConstant(Mask))
25553 return DAG.getConstant(1, dl, MaskVT);
25554 if (X86::isZeroNode(Mask))
25555 return DAG.getConstant(0, dl, MaskVT);
25556
25557 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25558
25559 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25560 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25561 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25562 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25563 SDValue Lo, Hi;
25564 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25565 DAG.getConstant(0, dl, MVT::i32));
25566 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
25567 DAG.getConstant(1, dl, MVT::i32));
25568
25569 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25570 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25571
25572 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25573 } else {
25574 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25575 Mask.getSimpleValueType().getSizeInBits());
25576 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25577 // are extracted by EXTRACT_SUBVECTOR.
25578 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25579 DAG.getBitcast(BitcastVT, Mask),
25580 DAG.getIntPtrConstant(0, dl));
25581 }
25582}
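// Example: an i8 mask lowered for MaskVT == v4i1 takes the else-branch above,
// is bitcast to v8i1, and has its low 4 elements extracted, so mask bit i ends
// up driving vector lane i; only the i64-mask-on-32-bit case needs the split.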
25583
25584/// Return (and \p Op, \p Mask) for compare instructions or
25585/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25586/// necessary casting or extending for \p Mask when lowering masking intrinsics
25587static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25588 SDValue PreservedSrc,
25589 const X86Subtarget &Subtarget,
25590 SelectionDAG &DAG) {
25591 MVT VT = Op.getSimpleValueType();
25592 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25593 unsigned OpcodeSelect = ISD::VSELECT;
25594 SDLoc dl(Op);
25595
25596 if (isAllOnesConstant(Mask))
25597 return Op;
25598
25599 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25600
25601 if (PreservedSrc.isUndef())
25602 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25603 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25604}
25605
25606/// Creates an SDNode for a predicated scalar operation.
25607/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25608 /// The mask comes in as MVT::i8 and should be transformed
25609 /// to MVT::v1i1 while lowering masking intrinsics.
25610/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25611/// "X86select" instead of "vselect". We just can't create the "vselect" node
25612/// for a scalar instruction.
25613static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25614 SDValue PreservedSrc,
25615 const X86Subtarget &Subtarget,
25616 SelectionDAG &DAG) {
25617
25618 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25619 if (MaskConst->getZExtValue() & 0x1)
25620 return Op;
25621
25622 MVT VT = Op.getSimpleValueType();
25623 SDLoc dl(Op);
25624
25625 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
25626 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25627 DAG.getBitcast(MVT::v8i1, Mask),
25628 DAG.getIntPtrConstant(0, dl));
25629 if (Op.getOpcode() == X86ISD::FSETCCM ||
25630 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25631 Op.getOpcode() == X86ISD::VFPCLASSS)
25632 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25633
25634 if (PreservedSrc.isUndef())
25635 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25636 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25637}
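// Contrast with getVectorMaskingNode above: the scalar form extracts bit 0 of
// the i8 mask into a v1i1 value and selects with X86ISD::SELECTS (or ANDs for
// the FSETCCM/VFPCLASSS compares), since a plain vselect cannot be formed for
// a single scalar lane.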
25638
25639static int getSEHRegistrationNodeSize(const Function *Fn) {
25640 if (!Fn->hasPersonalityFn())
25641 report_fatal_error(
25642 "querying registration node size for function without personality");
25643 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25644 // WinEHStatePass for the full struct definition.
25645 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25646 case EHPersonality::MSVC_X86SEH: return 24;
25647 case EHPersonality::MSVC_CXX: return 16;
25648 default: break;
25649 }
25650 report_fatal_error(
25651 "can only recover FP for 32-bit MSVC EH personality functions");
25652}
25653
25654/// When the MSVC runtime transfers control to us, either to an outlined
25655/// function or when returning to a parent frame after catching an exception, we
25656/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25657/// Here's the math:
25658/// RegNodeBase = EntryEBP - RegNodeSize
25659/// ParentFP = RegNodeBase - ParentFrameOffset
25660/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25661/// subtracting the offset (negative on x86) takes us back to the parent FP.
25662static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25663 SDValue EntryEBP) {
25664 MachineFunction &MF = DAG.getMachineFunction();
25665 SDLoc dl;
25666
25667 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25668 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25669
25670 // It's possible that the parent function no longer has a personality function
25671 // if the exceptional code was optimized away, in which case we just return
25672 // the incoming EBP.
25673 if (!Fn->hasPersonalityFn())
25674 return EntryEBP;
25675
25676 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25677 // registration, or the .set_setframe offset.
25678 MCSymbol *OffsetSym =
25679 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
25680 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
25681 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25682 SDValue ParentFrameOffset =
25683 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25684
25685 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25686 // prologue to RBP in the parent function.
25687 const X86Subtarget &Subtarget =
25688 static_cast<const X86Subtarget &>(DAG.getSubtarget());
25689 if (Subtarget.is64Bit())
25690 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25691
25692 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25693 // RegNodeBase = EntryEBP - RegNodeSize
25694 // ParentFP = RegNodeBase - ParentFrameOffset
25695 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25696 DAG.getConstant(RegNodeSize, dl, PtrVT));
25697 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25698}
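// Worked example with hypothetical numbers: for a 32-bit MSVC SEH personality
// RegNodeSize is 24, so EntryEBP = 0x1000 gives RegNodeBase = 0xfe8; if the
// registration node sits at offset -0x20 from the parent frame pointer
// (ParentFrameOffset = -0x20), then ParentFP = 0xfe8 - (-0x20) = 0x1008.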
25699
25700SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25701 SelectionDAG &DAG) const {
25702 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25703 auto isRoundModeCurDirection = [](SDValue Rnd) {
25704 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25705 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25706
25707 return false;
25708 };
25709 auto isRoundModeSAE = [](SDValue Rnd) {
25710 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25711 unsigned RC = C->getZExtValue();
25712 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25713 // Clear the NO_EXC bit and check remaining bits.
25714 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25715 // As a convenience we allow either no other bits set or an explicit
25716 // current-direction mode.
25717 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25718 }
25719 }
25720
25721 return false;
25722 };
25723 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25724 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25725 RC = C->getZExtValue();
25726 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25727 // Clear the NO_EXC bit and check remaining bits.
25728 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25729 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25730 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25731 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25732 RC == X86::STATIC_ROUNDING::TO_ZERO;
25733 }
25734 }
25735
25736 return false;
25737 };
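// To illustrate how the three helpers partition the rounding operand: a bare
// CUR_DIRECTION constant satisfies only isRoundModeCurDirection, NO_EXC alone
// (or NO_EXC | CUR_DIRECTION) satisfies isRoundModeSAE, and NO_EXC combined
// with an explicit mode such as TO_NEG_INF satisfies isRoundModeSAEToX, which
// hands back that explicit mode in RC.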
25738
25739 SDLoc dl(Op);
25740 unsigned IntNo = Op.getConstantOperandVal(0);
25741 MVT VT = Op.getSimpleValueType();
25742 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25743
25744 // Propagate flags from original node to transformed node(s).
25745 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25746
25747 if (IntrData) {
25748 switch(IntrData->Type) {
25749 case INTR_TYPE_1OP: {
25750 // We specify 2 possible opcodes for intrinsics with rounding modes.
25751 // First, we check if the intrinsic may have a non-default rounding mode
25752 // (IntrData->Opc1 != 0), and then we check the rounding mode operand.
25753 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25754 if (IntrWithRoundingModeOpcode != 0) {
25755 SDValue Rnd = Op.getOperand(2);
25756 unsigned RC = 0;
25757 if (isRoundModeSAEToX(Rnd, RC))
25758 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25759 Op.getOperand(1),
25760 DAG.getTargetConstant(RC, dl, MVT::i32));
25761 if (!isRoundModeCurDirection(Rnd))
25762 return SDValue();
25763 }
25764 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25765 Op.getOperand(1));
25766 }
25767 case INTR_TYPE_1OP_SAE: {
25768 SDValue Sae = Op.getOperand(2);
25769
25770 unsigned Opc;
25771 if (isRoundModeCurDirection(Sae))
25772 Opc = IntrData->Opc0;
25773 else if (isRoundModeSAE(Sae))
25774 Opc = IntrData->Opc1;
25775 else
25776 return SDValue();
25777
25778 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25779 }
25780 case INTR_TYPE_2OP: {
25781 SDValue Src2 = Op.getOperand(2);
25782
25783 // We specify 2 possible opcodes for intrinsics with rounding modes.
25784 // First, we check if the intrinsic may have a non-default rounding mode
25785 // (IntrData->Opc1 != 0), and then we check the rounding mode operand.
25786 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25787 if (IntrWithRoundingModeOpcode != 0) {
25788 SDValue Rnd = Op.getOperand(3);
25789 unsigned RC = 0;
25790 if (isRoundModeSAEToX(Rnd, RC))
25791 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25792 Op.getOperand(1), Src2,
25793 DAG.getTargetConstant(RC, dl, MVT::i32));
25794 if (!isRoundModeCurDirection(Rnd))
25795 return SDValue();
25796 }
25797
25798 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25799 Op.getOperand(1), Src2);
25800 }
25801 case INTR_TYPE_2OP_SAE: {
25802 SDValue Sae = Op.getOperand(3);
25803
25804 unsigned Opc;
25805 if (isRoundModeCurDirection(Sae))
25806 Opc = IntrData->Opc0;
25807 else if (isRoundModeSAE(Sae))
25808 Opc = IntrData->Opc1;
25809 else
25810 return SDValue();
25811
25812 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25813 Op.getOperand(2));
25814 }
25815 case INTR_TYPE_3OP:
25816 case INTR_TYPE_3OP_IMM8: {
25817 SDValue Src1 = Op.getOperand(1);
25818 SDValue Src2 = Op.getOperand(2);
25819 SDValue Src3 = Op.getOperand(3);
25820
25821 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25822 Src3.getValueType() != MVT::i8) {
25823 Src3 = DAG.getTargetConstant(
25824 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
25825 }
25826
25827 // We specify 2 possible opcodes for intrinsics with rounding modes.
25828 // First, we check if the intrinsic may have a non-default rounding mode
25829 // (IntrData->Opc1 != 0), and then we check the rounding mode operand.
25830 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25831 if (IntrWithRoundingModeOpcode != 0) {
25832 SDValue Rnd = Op.getOperand(4);
25833 unsigned RC = 0;
25834 if (isRoundModeSAEToX(Rnd, RC))
25835 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25836 Src1, Src2, Src3,
25837 DAG.getTargetConstant(RC, dl, MVT::i32));
25838 if (!isRoundModeCurDirection(Rnd))
25839 return SDValue();
25840 }
25841
25842 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25843 {Src1, Src2, Src3});
25844 }
25845 case INTR_TYPE_4OP_IMM8: {
25846 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25847 SDValue Src4 = Op.getOperand(4);
25848 if (Src4.getValueType() != MVT::i8) {
25849 Src4 = DAG.getTargetConstant(
25850 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
25851 }
25852
25853 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25854 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25855 Src4);
25856 }
25857 case INTR_TYPE_1OP_MASK: {
25858 SDValue Src = Op.getOperand(1);
25859 SDValue PassThru = Op.getOperand(2);
25860 SDValue Mask = Op.getOperand(3);
25861 // We add rounding mode to the Node when
25862 // - RC Opcode is specified and
25863 // - RC is not "current direction".
25864 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25865 if (IntrWithRoundingModeOpcode != 0) {
25866 SDValue Rnd = Op.getOperand(4);
25867 unsigned RC = 0;
25868 if (isRoundModeSAEToX(Rnd, RC))
25869 return getVectorMaskingNode(
25870 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25871 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25872 Mask, PassThru, Subtarget, DAG);
25873 if (!isRoundModeCurDirection(Rnd))
25874 return SDValue();
25875 }
25876 return getVectorMaskingNode(
25877 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25878 Subtarget, DAG);
25879 }
25880 case INTR_TYPE_1OP_MASK_SAE: {
25881 SDValue Src = Op.getOperand(1);
25882 SDValue PassThru = Op.getOperand(2);
25883 SDValue Mask = Op.getOperand(3);
25884 SDValue Rnd = Op.getOperand(4);
25885
25886 unsigned Opc;
25887 if (isRoundModeCurDirection(Rnd))
25888 Opc = IntrData->Opc0;
25889 else if (isRoundModeSAE(Rnd))
25890 Opc = IntrData->Opc1;
25891 else
25892 return SDValue();
25893
25894 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25895 Subtarget, DAG);
25896 }
25897 case INTR_TYPE_SCALAR_MASK: {
25898 SDValue Src1 = Op.getOperand(1);
25899 SDValue Src2 = Op.getOperand(2);
25900 SDValue passThru = Op.getOperand(3);
25901 SDValue Mask = Op.getOperand(4);
25902 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25903 // There are 2 kinds of intrinsics in this group:
25904 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25905 // (2) With rounding mode and sae - 7 operands.
25906 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25907 if (Op.getNumOperands() == (5U + HasRounding)) {
25908 if (HasRounding) {
25909 SDValue Rnd = Op.getOperand(5);
25910 unsigned RC = 0;
25911 if (isRoundModeSAEToX(Rnd, RC))
25912 return getScalarMaskingNode(
25913 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25914 DAG.getTargetConstant(RC, dl, MVT::i32)),
25915 Mask, passThru, Subtarget, DAG);
25916 if (!isRoundModeCurDirection(Rnd))
25917 return SDValue();
25918 }
25919 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25920 Src2),
25921 Mask, passThru, Subtarget, DAG);
25922 }
25923
25924 assert(Op.getNumOperands() == (6U + HasRounding) &&
25925 "Unexpected intrinsic form");
25926 SDValue RoundingMode = Op.getOperand(5);
25927 unsigned Opc = IntrData->Opc0;
25928 if (HasRounding) {
25929 SDValue Sae = Op.getOperand(6);
25930 if (isRoundModeSAE(Sae))
25931 Opc = IntrWithRoundingModeOpcode;
25932 else if (!isRoundModeCurDirection(Sae))
25933 return SDValue();
25934 }
25935 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25936 Src2, RoundingMode),
25937 Mask, passThru, Subtarget, DAG);
25938 }
25939 case INTR_TYPE_SCALAR_MASK_RND: {
25940 SDValue Src1 = Op.getOperand(1);
25941 SDValue Src2 = Op.getOperand(2);
25942 SDValue passThru = Op.getOperand(3);
25943 SDValue Mask = Op.getOperand(4);
25944 SDValue Rnd = Op.getOperand(5);
25945
25946 SDValue NewOp;
25947 unsigned RC = 0;
25948 if (isRoundModeCurDirection(Rnd))
25949 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25950 else if (isRoundModeSAEToX(Rnd, RC))
25951 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25952 DAG.getTargetConstant(RC, dl, MVT::i32));
25953 else
25954 return SDValue();
25955
25956 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25957 }
25958 case INTR_TYPE_SCALAR_MASK_SAE: {
25959 SDValue Src1 = Op.getOperand(1);
25960 SDValue Src2 = Op.getOperand(2);
25961 SDValue passThru = Op.getOperand(3);
25962 SDValue Mask = Op.getOperand(4);
25963 SDValue Sae = Op.getOperand(5);
25964 unsigned Opc;
25965 if (isRoundModeCurDirection(Sae))
25966 Opc = IntrData->Opc0;
25967 else if (isRoundModeSAE(Sae))
25968 Opc = IntrData->Opc1;
25969 else
25970 return SDValue();
25971
25972 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25973 Mask, passThru, Subtarget, DAG);
25974 }
25975 case INTR_TYPE_2OP_MASK: {
25976 SDValue Src1 = Op.getOperand(1);
25977 SDValue Src2 = Op.getOperand(2);
25978 SDValue PassThru = Op.getOperand(3);
25979 SDValue Mask = Op.getOperand(4);
25980 SDValue NewOp;
25981 if (IntrData->Opc1 != 0) {
25982 SDValue Rnd = Op.getOperand(5);
25983 unsigned RC = 0;
25984 if (isRoundModeSAEToX(Rnd, RC))
25985 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25986 DAG.getTargetConstant(RC, dl, MVT::i32));
25987 else if (!isRoundModeCurDirection(Rnd))
25988 return SDValue();
25989 }
25990 if (!NewOp)
25991 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25992 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25993 }
25994 case INTR_TYPE_2OP_MASK_SAE: {
25995 SDValue Src1 = Op.getOperand(1);
25996 SDValue Src2 = Op.getOperand(2);
25997 SDValue PassThru = Op.getOperand(3);
25998 SDValue Mask = Op.getOperand(4);
25999
26000 unsigned Opc = IntrData->Opc0;
26001 if (IntrData->Opc1 != 0) {
26002 SDValue Sae = Op.getOperand(5);
26003 if (isRoundModeSAE(Sae))
26004 Opc = IntrData->Opc1;
26005 else if (!isRoundModeCurDirection(Sae))
26006 return SDValue();
26007 }
26008
26009 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26010 Mask, PassThru, Subtarget, DAG);
26011 }
26012 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26013 SDValue Src1 = Op.getOperand(1);
26014 SDValue Src2 = Op.getOperand(2);
26015 SDValue Src3 = Op.getOperand(3);
26016 SDValue PassThru = Op.getOperand(4);
26017 SDValue Mask = Op.getOperand(5);
26018 SDValue Sae = Op.getOperand(6);
26019 unsigned Opc;
26020 if (isRoundModeCurDirection(Sae))
26021 Opc = IntrData->Opc0;
26022 else if (isRoundModeSAE(Sae))
26023 Opc = IntrData->Opc1;
26024 else
26025 return SDValue();
26026
26027 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26028 Mask, PassThru, Subtarget, DAG);
26029 }
26030 case INTR_TYPE_3OP_MASK_SAE: {
26031 SDValue Src1 = Op.getOperand(1);
26032 SDValue Src2 = Op.getOperand(2);
26033 SDValue Src3 = Op.getOperand(3);
26034 SDValue PassThru = Op.getOperand(4);
26035 SDValue Mask = Op.getOperand(5);
26036
26037 unsigned Opc = IntrData->Opc0;
26038 if (IntrData->Opc1 != 0) {
26039 SDValue Sae = Op.getOperand(6);
26040 if (isRoundModeSAE(Sae))
26041 Opc = IntrData->Opc1;
26042 else if (!isRoundModeCurDirection(Sae))
26043 return SDValue();
26044 }
26045 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26046 Mask, PassThru, Subtarget, DAG);
26047 }
26048 case BLENDV: {
26049 SDValue Src1 = Op.getOperand(1);
26050 SDValue Src2 = Op.getOperand(2);
26051 SDValue Src3 = Op.getOperand(3);
26052
26053 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26054 Src3 = DAG.getBitcast(MaskVT, Src3);
26055
26056 // Reverse the operands to match VSELECT order.
26057 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26058 }
26059 case VPERM_2OP : {
26060 SDValue Src1 = Op.getOperand(1);
26061 SDValue Src2 = Op.getOperand(2);
26062
26063 // Swap Src1 and Src2 in the node creation
26064 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26065 }
26066 case IFMA_OP:
26067 // NOTE: We need to swizzle the operands to pass the multiply operands
26068 // first.
26069 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26070 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26071 case FPCLASSS: {
26072 SDValue Src1 = Op.getOperand(1);
26073 SDValue Imm = Op.getOperand(2);
26074 SDValue Mask = Op.getOperand(3);
26075 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26076 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26077 Subtarget, DAG);
26078 // Need to fill with zeros to ensure the bitcast will produce zeroes
26079 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26080 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26081 DAG.getConstant(0, dl, MVT::v8i1),
26082 FPclassMask, DAG.getIntPtrConstant(0, dl));
26083 return DAG.getBitcast(MVT::i8, Ins);
26084 }
26085
26086 case CMP_MASK_CC: {
26087 MVT MaskVT = Op.getSimpleValueType();
26088 SDValue CC = Op.getOperand(3);
26089 SDValue Mask = Op.getOperand(4);
26090 // We specify 2 possible opcodes for intrinsics with rounding modes.
26091 // First, we check if the intrinsic may have a non-default rounding mode
26092 // (IntrData->Opc1 != 0), and then we check the rounding mode operand.
26093 if (IntrData->Opc1 != 0) {
26094 SDValue Sae = Op.getOperand(5);
26095 if (isRoundModeSAE(Sae))
26096 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26097 Op.getOperand(2), CC, Mask, Sae);
26098 if (!isRoundModeCurDirection(Sae))
26099 return SDValue();
26100 }
26101 // Default rounding mode
26102 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26103 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26104 }
26105 case CMP_MASK_SCALAR_CC: {
26106 SDValue Src1 = Op.getOperand(1);
26107 SDValue Src2 = Op.getOperand(2);
26108 SDValue CC = Op.getOperand(3);
26109 SDValue Mask = Op.getOperand(4);
26110
26111 SDValue Cmp;
26112 if (IntrData->Opc1 != 0) {
26113 SDValue Sae = Op.getOperand(5);
26114 if (isRoundModeSAE(Sae))
26115 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26116 else if (!isRoundModeCurDirection(Sae))
26117 return SDValue();
26118 }
26119 // Default rounding mode
26120 if (!Cmp.getNode())
26121 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26122
26123 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26124 Subtarget, DAG);
26125 // Need to fill with zeros to ensure the bitcast will produce zeroes
26126 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26127 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26128 DAG.getConstant(0, dl, MVT::v8i1),
26129 CmpMask, DAG.getIntPtrConstant(0, dl));
26130 return DAG.getBitcast(MVT::i8, Ins);
26131 }
26132 case COMI: { // Comparison intrinsics
26133 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26134 SDValue LHS = Op.getOperand(1);
26135 SDValue RHS = Op.getOperand(2);
26136 // Some conditions require the operands to be swapped.
26137 if (CC == ISD::SETLT || CC == ISD::SETLE)
26138 std::swap(LHS, RHS);
26139
26140 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
26141 SDValue SetCC;
26142 switch (CC) {
26143 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
26144 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26145 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26146 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26147 break;
26148 }
26149 case ISD::SETNE: { // (ZF = 1 or PF = 1)
26150 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26151 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26152 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26153 break;
26154 }
26155 case ISD::SETGT: // (CF = 0 and ZF = 0)
26156 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26157 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26158 break;
26159 }
26160 case ISD::SETGE: // CF = 0
26161 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26162 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26163 break;
26164 default:
26165 llvm_unreachable("Unexpected illegal condition!");
26166 }
26167 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26168 }
26169 case COMI_RM: { // Comparison intrinsics with Sae
26170 SDValue LHS = Op.getOperand(1);
26171 SDValue RHS = Op.getOperand(2);
26172 unsigned CondVal = Op.getConstantOperandVal(3);
26173 SDValue Sae = Op.getOperand(4);
26174
26175 SDValue FCmp;
26176 if (isRoundModeCurDirection(Sae))
26177 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26178 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26179 else if (isRoundModeSAE(Sae))
26180 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26181 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26182 else
26183 return SDValue();
26184 // Need to fill with zeros to ensure the bitcast will produce zeroes
26185 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26186 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26187 DAG.getConstant(0, dl, MVT::v16i1),
26188 FCmp, DAG.getIntPtrConstant(0, dl));
26189 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26190 DAG.getBitcast(MVT::i16, Ins));
26191 }
26192 case VSHIFT:
26193 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26194 Op.getOperand(1), Op.getOperand(2), Subtarget,
26195 DAG);
26196 case COMPRESS_EXPAND_IN_REG: {
26197 SDValue Mask = Op.getOperand(3);
26198 SDValue DataToCompress = Op.getOperand(1);
26199 SDValue PassThru = Op.getOperand(2);
26200 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26201 return Op.getOperand(1);
26202
26203 // Avoid false dependency.
26204 if (PassThru.isUndef())
26205 PassThru = DAG.getConstant(0, dl, VT);
26206
26207 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26208 Mask);
26209 }
26210 case FIXUPIMM:
26211 case FIXUPIMM_MASKZ: {
26212 SDValue Src1 = Op.getOperand(1);
26213 SDValue Src2 = Op.getOperand(2);
26214 SDValue Src3 = Op.getOperand(3);
26215 SDValue Imm = Op.getOperand(4);
26216 SDValue Mask = Op.getOperand(5);
26217 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26218 ? Src1
26219 : getZeroVector(VT, Subtarget, DAG, dl);
26220
26221 unsigned Opc = IntrData->Opc0;
26222 if (IntrData->Opc1 != 0) {
26223 SDValue Sae = Op.getOperand(6);
26224 if (isRoundModeSAE(Sae))
26225 Opc = IntrData->Opc1;
26226 else if (!isRoundModeCurDirection(Sae))
26227 return SDValue();
26228 }
26229
26230 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26231
26232 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26233 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26234
26235 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26236 }
26237 case ROUNDP: {
26238 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26239 // Clear the upper bits of the rounding immediate so that the legacy
26240 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26241 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
26242 SDValue RoundingMode =
26243 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26244 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26245 Op.getOperand(1), RoundingMode);
26246 }
26247 case ROUNDS: {
26248 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26249 // Clear the upper bits of the rounding immediate so that the legacy
26250 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26251 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
26252 SDValue RoundingMode =
26253 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
26254 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26255 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26256 }
26257 case BEXTRI: {
26258 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26259
26260 uint64_t Imm = Op.getConstantOperandVal(2);
26261 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26262 Op.getValueType());
26263 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26264 Op.getOperand(1), Control);
26265 }
26266 // ADC/ADCX/SBB
26267 case ADX: {
26268 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26269 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26270
26271 SDValue Res;
26272 // If the carry in is zero, then we should just use ADD/SUB instead of
26273 // ADC/SBB.
26274 if (isNullConstant(Op.getOperand(1))) {
26275 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26276 Op.getOperand(3));
26277 } else {
26278 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26279 DAG.getConstant(-1, dl, MVT::i8));
26280 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26281 Op.getOperand(3), GenCF.getValue(1));
26282 }
26283 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26284 SDValue Results[] = { SetCC, Res };
26285 return DAG.getMergeValues(Results, dl);
26286 }
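// Example: a carry intrinsic whose carry-in operand is a constant zero takes
// the first branch and emits a plain ADD/SUB (Opc1), deriving the carry-out
// via COND_B; otherwise the carry-in byte is first folded back into EFLAGS.CF
// by adding it to -1, and the ADC/SBB form (Opc0) consumes that flag.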
26287 case CVTPD2PS_MASK:
26288 case CVTPD2DQ_MASK:
26289 case CVTQQ2PS_MASK:
26290 case TRUNCATE_TO_REG: {
26291 SDValue Src = Op.getOperand(1);
26292 SDValue PassThru = Op.getOperand(2);
26293 SDValue Mask = Op.getOperand(3);
26294
26295 if (isAllOnesConstant(Mask))
26296 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26297
26298 MVT SrcVT = Src.getSimpleValueType();
26299 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26300 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26301 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26302 {Src, PassThru, Mask});
26303 }
26304 case CVTPS2PH_MASK: {
26305 SDValue Src = Op.getOperand(1);
26306 SDValue Rnd = Op.getOperand(2);
26307 SDValue PassThru = Op.getOperand(3);
26308 SDValue Mask = Op.getOperand(4);
26309
26310 if (isAllOnesConstant(Mask))
26311 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
26312
26313 MVT SrcVT = Src.getSimpleValueType();
26314 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26315 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26316 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
26317 PassThru, Mask);
26318
26319 }
26320 case CVTNEPS2BF16_MASK: {
26321 SDValue Src = Op.getOperand(1);
26322 SDValue PassThru = Op.getOperand(2);
26323 SDValue Mask = Op.getOperand(3);
26324
26325 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26326 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26327
26328 // Break false dependency.
26329 if (PassThru.isUndef())
26330 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26331
26332 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26333 Mask);
26334 }
26335 default:
26336 break;
26337 }
26338 }
26339
26340 switch (IntNo) {
26341 default: return SDValue(); // Don't custom lower most intrinsics.
26342
26343 // ptest and testp intrinsics. The intrinsics these come from are designed to
26344 // return an integer value, not just an instruction, so lower them to the ptest
26345 // or testp pattern and a setcc for the result.
26346 case Intrinsic::x86_avx512_ktestc_b:
26347 case Intrinsic::x86_avx512_ktestc_w:
26348 case Intrinsic::x86_avx512_ktestc_d:
26349 case Intrinsic::x86_avx512_ktestc_q:
26350 case Intrinsic::x86_avx512_ktestz_b:
26351 case Intrinsic::x86_avx512_ktestz_w:
26352 case Intrinsic::x86_avx512_ktestz_d:
26353 case Intrinsic::x86_avx512_ktestz_q:
26354 case Intrinsic::x86_sse41_ptestz:
26355 case Intrinsic::x86_sse41_ptestc:
26356 case Intrinsic::x86_sse41_ptestnzc:
26357 case Intrinsic::x86_avx_ptestz_256:
26358 case Intrinsic::x86_avx_ptestc_256:
26359 case Intrinsic::x86_avx_ptestnzc_256:
26360 case Intrinsic::x86_avx_vtestz_ps:
26361 case Intrinsic::x86_avx_vtestc_ps:
26362 case Intrinsic::x86_avx_vtestnzc_ps:
26363 case Intrinsic::x86_avx_vtestz_pd:
26364 case Intrinsic::x86_avx_vtestc_pd:
26365 case Intrinsic::x86_avx_vtestnzc_pd:
26366 case Intrinsic::x86_avx_vtestz_ps_256:
26367 case Intrinsic::x86_avx_vtestc_ps_256:
26368 case Intrinsic::x86_avx_vtestnzc_ps_256:
26369 case Intrinsic::x86_avx_vtestz_pd_256:
26370 case Intrinsic::x86_avx_vtestc_pd_256:
26371 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26372 unsigned TestOpc = X86ISD::PTEST;
26373 X86::CondCode X86CC;
26374 switch (IntNo) {
26375 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26376 case Intrinsic::x86_avx512_ktestc_b:
26377 case Intrinsic::x86_avx512_ktestc_w:
26378 case Intrinsic::x86_avx512_ktestc_d:
26379 case Intrinsic::x86_avx512_ktestc_q:
26380 // CF = 1
26381 TestOpc = X86ISD::KTEST;
26382 X86CC = X86::COND_B;
26383 break;
26384 case Intrinsic::x86_avx512_ktestz_b:
26385 case Intrinsic::x86_avx512_ktestz_w:
26386 case Intrinsic::x86_avx512_ktestz_d:
26387 case Intrinsic::x86_avx512_ktestz_q:
26388 TestOpc = X86ISD::KTEST;
26389 X86CC = X86::COND_E;
26390 break;
26391 case Intrinsic::x86_avx_vtestz_ps:
26392 case Intrinsic::x86_avx_vtestz_pd:
26393 case Intrinsic::x86_avx_vtestz_ps_256:
26394 case Intrinsic::x86_avx_vtestz_pd_256:
26395 TestOpc = X86ISD::TESTP;
26396 LLVM_FALLTHROUGH;
26397 case Intrinsic::x86_sse41_ptestz:
26398 case Intrinsic::x86_avx_ptestz_256:
26399 // ZF = 1
26400 X86CC = X86::COND_E;
26401 break;
26402 case Intrinsic::x86_avx_vtestc_ps:
26403 case Intrinsic::x86_avx_vtestc_pd:
26404 case Intrinsic::x86_avx_vtestc_ps_256:
26405 case Intrinsic::x86_avx_vtestc_pd_256:
26406 TestOpc = X86ISD::TESTP;
26407 LLVM_FALLTHROUGH;
26408 case Intrinsic::x86_sse41_ptestc:
26409 case Intrinsic::x86_avx_ptestc_256:
26410 // CF = 1
26411 X86CC = X86::COND_B;
26412 break;
26413 case Intrinsic::x86_avx_vtestnzc_ps:
26414 case Intrinsic::x86_avx_vtestnzc_pd:
26415 case Intrinsic::x86_avx_vtestnzc_ps_256:
26416 case Intrinsic::x86_avx_vtestnzc_pd_256:
26417 TestOpc = X86ISD::TESTP;
26418 LLVM_FALLTHROUGH;
26419 case Intrinsic::x86_sse41_ptestnzc:
26420 case Intrinsic::x86_avx_ptestnzc_256:
26421 // ZF and CF = 0
26422 X86CC = X86::COND_A;
26423 break;
26424 }
26425
26426 SDValue LHS = Op.getOperand(1);
26427 SDValue RHS = Op.getOperand(2);
26428 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26429 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26430 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26431 }
26432
26433 case Intrinsic::x86_sse42_pcmpistria128:
26434 case Intrinsic::x86_sse42_pcmpestria128:
26435 case Intrinsic::x86_sse42_pcmpistric128:
26436 case Intrinsic::x86_sse42_pcmpestric128:
26437 case Intrinsic::x86_sse42_pcmpistrio128:
26438 case Intrinsic::x86_sse42_pcmpestrio128:
26439 case Intrinsic::x86_sse42_pcmpistris128:
26440 case Intrinsic::x86_sse42_pcmpestris128:
26441 case Intrinsic::x86_sse42_pcmpistriz128:
26442 case Intrinsic::x86_sse42_pcmpestriz128: {
26443 unsigned Opcode;
26444 X86::CondCode X86CC;
26445 switch (IntNo) {
26446 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26447 case Intrinsic::x86_sse42_pcmpistria128:
26448 Opcode = X86ISD::PCMPISTR;
26449 X86CC = X86::COND_A;
26450 break;
26451 case Intrinsic::x86_sse42_pcmpestria128:
26452 Opcode = X86ISD::PCMPESTR;
26453 X86CC = X86::COND_A;
26454 break;
26455 case Intrinsic::x86_sse42_pcmpistric128:
26456 Opcode = X86ISD::PCMPISTR;
26457 X86CC = X86::COND_B;
26458 break;
26459 case Intrinsic::x86_sse42_pcmpestric128:
26460 Opcode = X86ISD::PCMPESTR;
26461 X86CC = X86::COND_B;
26462 break;
26463 case Intrinsic::x86_sse42_pcmpistrio128:
26464 Opcode = X86ISD::PCMPISTR;
26465 X86CC = X86::COND_O;
26466 break;
26467 case Intrinsic::x86_sse42_pcmpestrio128:
26468 Opcode = X86ISD::PCMPESTR;
26469 X86CC = X86::COND_O;
26470 break;
26471 case Intrinsic::x86_sse42_pcmpistris128:
26472 Opcode = X86ISD::PCMPISTR;
26473 X86CC = X86::COND_S;
26474 break;
26475 case Intrinsic::x86_sse42_pcmpestris128:
26476 Opcode = X86ISD::PCMPESTR;
26477 X86CC = X86::COND_S;
26478 break;
26479 case Intrinsic::x86_sse42_pcmpistriz128:
26480 Opcode = X86ISD::PCMPISTR;
26481 X86CC = X86::COND_E;
26482 break;
26483 case Intrinsic::x86_sse42_pcmpestriz128:
26484 Opcode = X86ISD::PCMPESTR;
26485 X86CC = X86::COND_E;
26486 break;
26487 }
26488 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26489 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26490 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26491 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26492 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26493 }
26494
26495 case Intrinsic::x86_sse42_pcmpistri128:
26496 case Intrinsic::x86_sse42_pcmpestri128: {
26497 unsigned Opcode;
26498 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26499 Opcode = X86ISD::PCMPISTR;
26500 else
26501 Opcode = X86ISD::PCMPESTR;
26502
26503 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26504 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26505 return DAG.getNode(Opcode, dl, VTs, NewOps);
26506 }
26507
26508 case Intrinsic::x86_sse42_pcmpistrm128:
26509 case Intrinsic::x86_sse42_pcmpestrm128: {
26510 unsigned Opcode;
26511 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26512 Opcode = X86ISD::PCMPISTR;
26513 else
26514 Opcode = X86ISD::PCMPESTR;
26515
26516 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
26517 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26518 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26519 }
26520
26521 case Intrinsic::eh_sjlj_lsda: {
26522 MachineFunction &MF = DAG.getMachineFunction();
26523 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26524 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26525 auto &Context = MF.getMMI().getContext();
26526 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26527 Twine(MF.getFunctionNumber()));
26528 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
26529 DAG.getMCSymbol(S, PtrVT));
26530 }
26531
26532 case Intrinsic::x86_seh_lsda: {
26533 // Compute the symbol for the LSDA. We know it'll get emitted later.
26534 MachineFunction &MF = DAG.getMachineFunction();
26535 SDValue Op1 = Op.getOperand(1);
26536 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26537 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26538 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26539
26540 // Generate a simple absolute symbol reference. This intrinsic is only
26541 // supported on 32-bit Windows, which isn't PIC.
26542 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26543 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26544 }
26545
26546 case Intrinsic::eh_recoverfp: {
26547 SDValue FnOp = Op.getOperand(1);
26548 SDValue IncomingFPOp = Op.getOperand(2);
26549 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26550 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26551 if (!Fn)
26552 report_fatal_error(
26553 "llvm.eh.recoverfp must take a function as the first argument");
26554 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26555 }
26556
26557 case Intrinsic::localaddress: {
26558 // Returns one of the stack, base, or frame pointer registers, depending on
26559 // which is used to reference local variables.
26560 MachineFunction &MF = DAG.getMachineFunction();
26561 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26562 unsigned Reg;
26563 if (RegInfo->hasBasePointer(MF))
26564 Reg = RegInfo->getBaseRegister();
26565 else { // Handles the SP or FP case.
26566 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26567 if (CantUseFP)
26568 Reg = RegInfo->getPtrSizedStackRegister(MF);
26569 else
26570 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26571 }
26572 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26573 }
26574 case Intrinsic::swift_async_context_addr: {
26575 auto &MF = DAG.getMachineFunction();
26576 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26577 if (Subtarget.is64Bit()) {
26578 MF.getFrameInfo().setFrameAddressIsTaken(true);
26579 X86FI->setHasSwiftAsyncContext(true);
26580 return SDValue(
26581 DAG.getMachineNode(
26582 X86::SUB64ri8, dl, MVT::i64,
26583 DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
26584 DAG.getTargetConstant(8, dl, MVT::i32)),
26585 0);
26586 } else {
26587 // 32-bit, so there is no special extended frame; create or reuse an existing
26588 // stack slot.
26589 if (!X86FI->getSwiftAsyncContextFrameIdx())
26590 X86FI->setSwiftAsyncContextFrameIdx(
26591 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
26592 return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
26593 }
26594 }
26595 case Intrinsic::x86_avx512_vp2intersect_q_512:
26596 case Intrinsic::x86_avx512_vp2intersect_q_256:
26597 case Intrinsic::x86_avx512_vp2intersect_q_128:
26598 case Intrinsic::x86_avx512_vp2intersect_d_512:
26599 case Intrinsic::x86_avx512_vp2intersect_d_256:
26600 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26601 MVT MaskVT = Op.getSimpleValueType();
26602
26603 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26604 SDLoc DL(Op);
26605
26606 SDValue Operation =
26607 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26608 Op->getOperand(1), Op->getOperand(2));
26609
26610 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26611 MaskVT, Operation);
26612 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26613 MaskVT, Operation);
26614 return DAG.getMergeValues({Result0, Result1}, DL);
26615 }
26616 case Intrinsic::x86_mmx_pslli_w:
26617 case Intrinsic::x86_mmx_pslli_d:
26618 case Intrinsic::x86_mmx_pslli_q:
26619 case Intrinsic::x86_mmx_psrli_w:
26620 case Intrinsic::x86_mmx_psrli_d:
26621 case Intrinsic::x86_mmx_psrli_q:
26622 case Intrinsic::x86_mmx_psrai_w:
26623 case Intrinsic::x86_mmx_psrai_d: {
26624 SDLoc DL(Op);
26625 SDValue ShAmt = Op.getOperand(2);
26626 // If the argument is a constant, convert it to a target constant.
26627 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26628 // Clamp out-of-bounds shift amounts since they will otherwise be masked
26629 // to 8 bits, which may bring them back in bounds.
26630 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26631 if (ShiftAmount == 0)
26632 return Op.getOperand(1);
26633
26634 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26635 Op.getOperand(0), Op.getOperand(1),
26636 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26637 }
26638
26639 unsigned NewIntrinsic;
26640 switch (IntNo) {
26641 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26642 case Intrinsic::x86_mmx_pslli_w:
26643 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26644 break;
26645 case Intrinsic::x86_mmx_pslli_d:
26646 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26647 break;
26648 case Intrinsic::x86_mmx_pslli_q:
26649 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26650 break;
26651 case Intrinsic::x86_mmx_psrli_w:
26652 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26653 break;
26654 case Intrinsic::x86_mmx_psrli_d:
26655 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26656 break;
26657 case Intrinsic::x86_mmx_psrli_q:
26658 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26659 break;
26660 case Intrinsic::x86_mmx_psrai_w:
26661 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26662 break;
26663 case Intrinsic::x86_mmx_psrai_d:
26664 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26665 break;
26666 }
26667
26668 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
26669 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26670 // MMX register.
26671 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26672 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26673 DAG.getTargetConstant(NewIntrinsic, DL,
26674 getPointerTy(DAG.getDataLayout())),
26675 Op.getOperand(1), ShAmt);
26676 }
26677 }
26678}
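// End-to-end example for the MMX case above: llvm.x86.mmx.pslli.w with a
// constant shift of 3 is re-emitted as the same intrinsic with a
// TargetConstant amount, while a variable amount is moved into an MMX register
// via MMX_MOVW2D and the call is rewritten to the non-immediate
// llvm.x86.mmx.psll.w form.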
26679
26680static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26681 SDValue Src, SDValue Mask, SDValue Base,
26682 SDValue Index, SDValue ScaleOp, SDValue Chain,
26683 const X86Subtarget &Subtarget) {
26684 SDLoc dl(Op);
26685 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26686 // Scale must be constant.
26687 if (!C)
26688 return SDValue();
26689 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26690 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26691 TLI.getPointerTy(DAG.getDataLayout()));
26692 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26693 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26694 // If source is undef or we know it won't be used, use a zero vector
26695 // to break register dependency.
26696 // TODO: use undef instead and let BreakFalseDeps deal with it?
26697 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26698 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26699
26700 // Cast mask to an integer type.
26701 Mask = DAG.getBitcast(MaskVT, Mask);
26702
26703 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26704
26705 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26706 SDValue Res =
26707 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26708 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26709 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26710}
26711
26712static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26713 SDValue Src, SDValue Mask, SDValue Base,
26714 SDValue Index, SDValue ScaleOp, SDValue Chain,
26715 const X86Subtarget &Subtarget) {
26716 MVT VT = Op.getSimpleValueType();
26717 SDLoc dl(Op);
26718 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26719 // Scale must be constant.
26720 if (!C)
26721 return SDValue();
26722 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26723 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26724 TLI.getPointerTy(DAG.getDataLayout()));
26725 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26726 VT.getVectorNumElements());
26727 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26728
26729 // We support two versions of the gather intrinsics. One with scalar mask and
26730 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26731 if (Mask.getValueType() != MaskVT)
26732 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26733
26734 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26735 // If source is undef or we know it won't be used, use a zero vector
26736 // to break register dependency.
26737 // TODO: use undef instead and let BreakFalseDeps deal with it?
26738 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26739 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26740
26741 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26742
26743 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26744 SDValue Res =
26745 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26746 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26747 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26748}
26749
26750static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26751 SDValue Src, SDValue Mask, SDValue Base,
26752 SDValue Index, SDValue ScaleOp, SDValue Chain,
26753 const X86Subtarget &Subtarget) {
26754 SDLoc dl(Op);
26755 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26756 // Scale must be constant.
26757 if (!C)
26758 return SDValue();
26759 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26760 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26761 TLI.getPointerTy(DAG.getDataLayout()));
26762 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26763 Src.getSimpleValueType().getVectorNumElements());
26764 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26765
26766 // We support two versions of the scatter intrinsics. One with scalar mask and
26767 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26768 if (Mask.getValueType() != MaskVT)
26769 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26770
26771 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26772
26773 SDVTList VTs = DAG.getVTList(MVT::Other);
26774 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26775 SDValue Res =
26776 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26777 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26778 return Res;
26779}
26780
26781static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26782 SDValue Mask, SDValue Base, SDValue Index,
26783 SDValue ScaleOp, SDValue Chain,
26784 const X86Subtarget &Subtarget) {
26785 SDLoc dl(Op);
26786 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26787 // Scale must be constant.
26788 if (!C)
26789 return SDValue();
26790 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26791 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26792 TLI.getPointerTy(DAG.getDataLayout()));
26793 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26794 SDValue Segment = DAG.getRegister(0, MVT::i32);
26795 MVT MaskVT =
26796 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26797 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26798 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26799 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26800 return SDValue(Res, 0);
26801}
26802
26803/// Handles the lowering of builtin intrinsics with chain that return their
26804/// value into registers EDX:EAX.
26805/// If operand SrcReg is a valid register identifier, then operand 2 of N is
26806/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26807/// TargetOpcode.
26808/// Returns a Glue value which can be used to add extra copy-from-reg if the
26809/// expanded intrinsic implicitly defines extra registers (i.e. not just
26810/// EDX:EAX).
26811static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26812 SelectionDAG &DAG,
26813 unsigned TargetOpcode,
26814 unsigned SrcReg,
26815 const X86Subtarget &Subtarget,
26816 SmallVectorImpl<SDValue> &Results) {
26817 SDValue Chain = N->getOperand(0);
26818 SDValue Glue;
26819
26820 if (SrcReg) {
26821 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26822 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26823 Glue = Chain.getValue(1);
26824 }
26825
26826 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26827 SDValue N1Ops[] = {Chain, Glue};
26828 SDNode *N1 = DAG.getMachineNode(
26829 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26830 Chain = SDValue(N1, 0);
26831
26832 // Reads the content of XCR and returns it in registers EDX:EAX.
26833 SDValue LO, HI;
26834 if (Subtarget.is64Bit()) {
26835 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26836 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26837 LO.getValue(2));
26838 } else {
26839 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26840 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26841 LO.getValue(2));
26842 }
26843 Chain = HI.getValue(1);
26844 Glue = HI.getValue(2);
26845
26846 if (Subtarget.is64Bit()) {
26847 // Merge the two 32-bit values into a 64-bit one.
26848 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26849 DAG.getConstant(32, DL, MVT::i8));
26850 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26851 Results.push_back(Chain);
26852 return Glue;
26853 }
26854
26855 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26856 SDValue Ops[] = { LO, HI };
26857 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26858 Results.push_back(Pair);
26859 Results.push_back(Chain);
26860 return Glue;
26861}
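As a rough standalone sketch of the value produced by either path above (the SHL/OR combine on 64-bit targets and the BUILD_PAIR on 32-bit targets), assuming hypothetical names lo32/hi32 for the raw EAX/EDX contents:

#include <cstdint>

// Minimal model, not part of the lowering: both code paths yield
// (HI << 32) | LO, with the EAX value in the low half.
uint64_t mergeEdxEax(uint32_t lo32, uint32_t hi32) {
  return (static_cast<uint64_t>(hi32) << 32) | lo32;
}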
26862
26863/// Handles the lowering of builtin intrinsics that read the time stamp counter
26864/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26865/// READCYCLECOUNTER nodes.
26866static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26867 SelectionDAG &DAG,
26868 const X86Subtarget &Subtarget,
26869 SmallVectorImpl<SDValue> &Results) {
26870 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26871 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26872 // and the EAX register is loaded with the low-order 32 bits.
26873 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26874 /* NoRegister */0, Subtarget,
26875 Results);
26876 if (Opcode != X86::RDTSCP)
26877 return;
26878
26879 SDValue Chain = Results[1];
26880 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
26881 // the ECX register. Add 'ecx' explicitly to the chain.
26882 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26883 Results[1] = ecx;
26884 Results.push_back(ecx.getValue(1));
26885}
26886
26887static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26888 SelectionDAG &DAG) {
26889 SmallVector<SDValue, 3> Results;
26890 SDLoc DL(Op);
26891 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26892 Results);
26893 return DAG.getMergeValues(Results, DL);
26894}
26895
26896static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26897 MachineFunction &MF = DAG.getMachineFunction();
26898 SDValue Chain = Op.getOperand(0);
26899 SDValue RegNode = Op.getOperand(2);
26900 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26901 if (!EHInfo)
26902 report_fatal_error("EH registrations only live in functions using WinEH");
26903
26904 // Cast the operand to an alloca, and remember the frame index.
26905 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26906 if (!FINode)
26907 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26908 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26909
26910 // Return the chain operand without making any DAG nodes.
26911 return Chain;
26912}
26913
26914static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26915 MachineFunction &MF = DAG.getMachineFunction();
26916 SDValue Chain = Op.getOperand(0);
26917 SDValue EHGuard = Op.getOperand(2);
26918 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26919 if (!EHInfo)
26920 report_fatal_error("EHGuard only live in functions using WinEH");
26921
26922 // Cast the operand to an alloca, and remember the frame index.
26923 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26924 if (!FINode)
26925 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26926 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26927
26928 // Return the chain operand without making any DAG nodes.
26929 return Chain;
26930}
26931
26932/// Emit Truncating Store with signed or unsigned saturation.
26933static SDValue
26934EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
26935 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26936 SelectionDAG &DAG) {
26937 SDVTList VTs = DAG.getVTList(MVT::Other);
26938 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26939 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26940 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26941 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26942}
26943
26944/// Emit Masked Truncating Store with signed or unsigned saturation.
26945static SDValue
26946EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
26947 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26948 MachineMemOperand *MMO, SelectionDAG &DAG) {
26949 SDVTList VTs = DAG.getVTList(MVT::Other);
26950 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26951 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26952 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
26953}
26954
26955static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26956 SelectionDAG &DAG) {
26957 unsigned IntNo = Op.getConstantOperandVal(1);
26958 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26959 if (!IntrData) {
26960 switch (IntNo) {
26961 case llvm::Intrinsic::x86_seh_ehregnode:
26962 return MarkEHRegistrationNode(Op, DAG);
26963 case llvm::Intrinsic::x86_seh_ehguard:
26964 return MarkEHGuard(Op, DAG);
26965 case llvm::Intrinsic::x86_rdpkru: {
26966 SDLoc dl(Op);
26967 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26968 // Create a RDPKRU node and pass 0 to the ECX parameter.
26969 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26970 DAG.getConstant(0, dl, MVT::i32));
26971 }
26972 case llvm::Intrinsic::x86_wrpkru: {
26973 SDLoc dl(Op);
26974 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26975 // to the EDX and ECX parameters.
26976 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26977 Op.getOperand(0), Op.getOperand(2),
26978 DAG.getConstant(0, dl, MVT::i32),
26979 DAG.getConstant(0, dl, MVT::i32));
26980 }
26981 case llvm::Intrinsic::x86_flags_read_u32:
26982 case llvm::Intrinsic::x86_flags_read_u64:
26983 case llvm::Intrinsic::x86_flags_write_u32:
26984 case llvm::Intrinsic::x86_flags_write_u64: {
26985 // We need a frame pointer because this will get lowered to a PUSH/POP
26986 // sequence.
26987 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26988 MFI.setHasCopyImplyingStackAdjustment(true);
26989 // Don't do anything here, we will expand these intrinsics out later
26990 // during FinalizeISel in EmitInstrWithCustomInserter.
26991 return Op;
26992 }
26993 case Intrinsic::x86_lwpins32:
26994 case Intrinsic::x86_lwpins64:
26995 case Intrinsic::x86_umwait:
26996 case Intrinsic::x86_tpause: {
26997 SDLoc dl(Op);
26998 SDValue Chain = Op->getOperand(0);
26999 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27000 unsigned Opcode;
27001
27002 switch (IntNo) {
27003 default: llvm_unreachable("Impossible intrinsic");
27004 case Intrinsic::x86_umwait:
27005 Opcode = X86ISD::UMWAIT;
27006 break;
27007 case Intrinsic::x86_tpause:
27008 Opcode = X86ISD::TPAUSE;
27009 break;
27010 case Intrinsic::x86_lwpins32:
27011 case Intrinsic::x86_lwpins64:
27012 Opcode = X86ISD::LWPINS;
27013 break;
27014 }
27015
27016 SDValue Operation =
27017 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27018 Op->getOperand(3), Op->getOperand(4));
27019 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27020 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27021 Operation.getValue(1));
27022 }
27023 case Intrinsic::x86_enqcmd:
27024 case Intrinsic::x86_enqcmds: {
27025 SDLoc dl(Op);
27026 SDValue Chain = Op.getOperand(0);
27027 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27028 unsigned Opcode;
27029 switch (IntNo) {
27030 default: llvm_unreachable("Impossible intrinsic!");
27031 case Intrinsic::x86_enqcmd:
27032 Opcode = X86ISD::ENQCMD;
27033 break;
27034 case Intrinsic::x86_enqcmds:
27035 Opcode = X86ISD::ENQCMDS;
27036 break;
27037 }
27038 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27039 Op.getOperand(3));
27040 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27041 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27042 Operation.getValue(1));
27043 }
27044 case Intrinsic::x86_aesenc128kl:
27045 case Intrinsic::x86_aesdec128kl:
27046 case Intrinsic::x86_aesenc256kl:
27047 case Intrinsic::x86_aesdec256kl: {
27048 SDLoc DL(Op);
27049 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27050 SDValue Chain = Op.getOperand(0);
27051 unsigned Opcode;
27052
27053 switch (IntNo) {
27054 default: llvm_unreachable("Impossible intrinsic");
27055 case Intrinsic::x86_aesenc128kl:
27056 Opcode = X86ISD::AESENC128KL;
27057 break;
27058 case Intrinsic::x86_aesdec128kl:
27059 Opcode = X86ISD::AESDEC128KL;
27060 break;
27061 case Intrinsic::x86_aesenc256kl:
27062 Opcode = X86ISD::AESENC256KL;
27063 break;
27064 case Intrinsic::x86_aesdec256kl:
27065 Opcode = X86ISD::AESDEC256KL;
27066 break;
27067 }
27068
27069 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27070 MachineMemOperand *MMO = MemIntr->getMemOperand();
27071 EVT MemVT = MemIntr->getMemoryVT();
27072 SDValue Operation = DAG.getMemIntrinsicNode(
27073 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27074 MMO);
27075 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27076
27077 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27078 {ZF, Operation.getValue(0), Operation.getValue(2)});
27079 }
27080 case Intrinsic::x86_aesencwide128kl:
27081 case Intrinsic::x86_aesdecwide128kl:
27082 case Intrinsic::x86_aesencwide256kl:
27083 case Intrinsic::x86_aesdecwide256kl: {
27084 SDLoc DL(Op);
27085 SDVTList VTs = DAG.getVTList(
27086 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27087 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27088 SDValue Chain = Op.getOperand(0);
27089 unsigned Opcode;
27090
27091 switch (IntNo) {
27092 default: llvm_unreachable("Impossible intrinsic");
27093 case Intrinsic::x86_aesencwide128kl:
27094 Opcode = X86ISD::AESENCWIDE128KL;
27095 break;
27096 case Intrinsic::x86_aesdecwide128kl:
27097 Opcode = X86ISD::AESDECWIDE128KL;
27098 break;
27099 case Intrinsic::x86_aesencwide256kl:
27100 Opcode = X86ISD::AESENCWIDE256KL;
27101 break;
27102 case Intrinsic::x86_aesdecwide256kl:
27103 Opcode = X86ISD::AESDECWIDE256KL;
27104 break;
27105 }
27106
27107 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27108 MachineMemOperand *MMO = MemIntr->getMemOperand();
27109 EVT MemVT = MemIntr->getMemoryVT();
27110 SDValue Operation = DAG.getMemIntrinsicNode(
27111 Opcode, DL, VTs,
27112 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27113 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27114 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27115 MemVT, MMO);
27116 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27117
27118 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27119 {ZF, Operation.getValue(1), Operation.getValue(2),
27120 Operation.getValue(3), Operation.getValue(4),
27121 Operation.getValue(5), Operation.getValue(6),
27122 Operation.getValue(7), Operation.getValue(8),
27123 Operation.getValue(9)});
27124 }
27125 case Intrinsic::x86_testui: {
27126 SDLoc dl(Op);
27127 SDValue Chain = Op.getOperand(0);
27128 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27129 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27130 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27131 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27132 Operation.getValue(1));
27133 }
27134 }
27135 return SDValue();
27136 }
27137
27138 SDLoc dl(Op);
27139 switch(IntrData->Type) {
27140 default: llvm_unreachable("Unknown Intrinsic Type");
27141 case RDSEED:
27142 case RDRAND: {
27143 // Emit the node with the right value type.
27144 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27145 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27146
27147 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27148 // Otherwise return the value from Rand, which is always 0, cast to i32.
27149 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27150 DAG.getConstant(1, dl, Op->getValueType(1)),
27151 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27152 SDValue(Result.getNode(), 1)};
27153 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27154
27155 // Return { result, isValid, chain }.
27156 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27157 SDValue(Result.getNode(), 2));
27158 }
27159 case GATHER_AVX2: {
27160 SDValue Chain = Op.getOperand(0);
27161 SDValue Src = Op.getOperand(2);
27162 SDValue Base = Op.getOperand(3);
27163 SDValue Index = Op.getOperand(4);
27164 SDValue Mask = Op.getOperand(5);
27165 SDValue Scale = Op.getOperand(6);
27166 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27167 Scale, Chain, Subtarget);
27168 }
27169 case GATHER: {
27170 //gather(v1, mask, index, base, scale);
27171 SDValue Chain = Op.getOperand(0);
27172 SDValue Src = Op.getOperand(2);
27173 SDValue Base = Op.getOperand(3);
27174 SDValue Index = Op.getOperand(4);
27175 SDValue Mask = Op.getOperand(5);
27176 SDValue Scale = Op.getOperand(6);
27177 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27178 Chain, Subtarget);
27179 }
27180 case SCATTER: {
27181 //scatter(base, mask, index, v1, scale);
27182 SDValue Chain = Op.getOperand(0);
27183 SDValue Base = Op.getOperand(2);
27184 SDValue Mask = Op.getOperand(3);
27185 SDValue Index = Op.getOperand(4);
27186 SDValue Src = Op.getOperand(5);
27187 SDValue Scale = Op.getOperand(6);
27188 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27189 Scale, Chain, Subtarget);
27190 }
27191 case PREFETCH: {
27192 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27193 assert((HintVal == 2 || HintVal == 3) &&
27194        "Wrong prefetch hint in intrinsic: should be 2 or 3");
27195 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27196 SDValue Chain = Op.getOperand(0);
27197 SDValue Mask = Op.getOperand(2);
27198 SDValue Index = Op.getOperand(3);
27199 SDValue Base = Op.getOperand(4);
27200 SDValue Scale = Op.getOperand(5);
27201 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27202 Subtarget);
27203 }
27204 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27205 case RDTSC: {
27206 SmallVector<SDValue, 2> Results;
27207 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27208 Results);
27209 return DAG.getMergeValues(Results, dl);
27210 }
27211 // Read Performance Monitoring Counters.
27212 case RDPMC:
27213 // Get Extended Control Register.
27214 case XGETBV: {
27215 SmallVector<SDValue, 2> Results;
27216
27217 // RDPMC uses ECX to select the index of the performance counter to read.
27218 // XGETBV uses ECX to select the index of the XCR register to return.
27219 // The result is stored into registers EDX:EAX.
27220 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27221 Subtarget, Results);
27222 return DAG.getMergeValues(Results, dl);
27223 }
27224 // XTEST intrinsics.
27225 case XTEST: {
27226 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27227 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27228
27229 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27230 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27231 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27232 Ret, SDValue(InTrans.getNode(), 1));
27233 }
27234 case TRUNCATE_TO_MEM_VI8:
27235 case TRUNCATE_TO_MEM_VI16:
27236 case TRUNCATE_TO_MEM_VI32: {
27237 SDValue Mask = Op.getOperand(4);
27238 SDValue DataToTruncate = Op.getOperand(3);
27239 SDValue Addr = Op.getOperand(2);
27240 SDValue Chain = Op.getOperand(0);
27241
27242 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27243 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27244
27245 EVT MemVT = MemIntr->getMemoryVT();
27246
27247 uint16_t TruncationOp = IntrData->Opc0;
27248 switch (TruncationOp) {
27249 case X86ISD::VTRUNC: {
27250 if (isAllOnesConstant(Mask)) // return just a truncate store
27251 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27252 MemIntr->getMemOperand());
27253
27254 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27255 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27256 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27257
27258 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27259 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27260 true /* truncating */);
27261 }
27262 case X86ISD::VTRUNCUS:
27263 case X86ISD::VTRUNCS: {
27264 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27265 if (isAllOnesConstant(Mask))
27266 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27267 MemIntr->getMemOperand(), DAG);
27268
27269 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27270 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27271
27272 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27273 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27274 }
27275 default:
27276 llvm_unreachable("Unsupported truncstore intrinsic");
27277 }
27278 }
27279 }
27280}
27281
27282SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27283 SelectionDAG &DAG) const {
27284 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27285 MFI.setReturnAddressIsTaken(true);
27286
27287 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27288 return SDValue();
27289
27290 unsigned Depth = Op.getConstantOperandVal(0);
27291 SDLoc dl(Op);
27292 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27293
27294 if (Depth > 0) {
27295 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27296 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27297 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27298 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27299 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27300 MachinePointerInfo());
27301 }
27302
27303 // Just load the return address.
27304 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27305 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27306 MachinePointerInfo());
27307}
27308
27309SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27310 SelectionDAG &DAG) const {
27311 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27312 return getReturnAddressFrameIndex(DAG);
27313}
27314
27315SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27316 MachineFunction &MF = DAG.getMachineFunction();
27317 MachineFrameInfo &MFI = MF.getFrameInfo();
27318 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27319 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27320 EVT VT = Op.getValueType();
27321
27322 MFI.setFrameAddressIsTaken(true);
27323
27324 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27325 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27326 // is not possible to crawl up the stack without looking at the unwind codes
27327 // simultaneously.
27328 int FrameAddrIndex = FuncInfo->getFAIndex();
27329 if (!FrameAddrIndex) {
27330 // Set up a frame object for the return address.
27331 unsigned SlotSize = RegInfo->getSlotSize();
27332 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27333 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27334 FuncInfo->setFAIndex(FrameAddrIndex);
27335 }
27336 return DAG.getFrameIndex(FrameAddrIndex, VT);
27337 }
27338
27339 unsigned FrameReg =
27340 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27341 SDLoc dl(Op); // FIXME probably not meaningful
27342 unsigned Depth = Op.getConstantOperandVal(0);
27343 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27344         (FrameReg == X86::EBP && VT == MVT::i32)) &&
27345        "Invalid Frame Register!");
27346 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27347 while (Depth--)
27348 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27349 MachinePointerInfo());
27350 return FrameAddr;
27351}
27352
27353// FIXME? Maybe this could be a TableGen attribute on some registers and
27354// this table could be generated automatically from RegInfo.
27355Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27356 const MachineFunction &MF) const {
27357 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27358
27359 Register Reg = StringSwitch<unsigned>(RegName)
27360 .Case("esp", X86::ESP)
27361 .Case("rsp", X86::RSP)
27362 .Case("ebp", X86::EBP)
27363 .Case("rbp", X86::RBP)
27364 .Default(0);
27365
27366 if (Reg == X86::EBP || Reg == X86::RBP) {
27367 if (!TFI.hasFP(MF))
27368 report_fatal_error("register " + StringRef(RegName) +
27369 " is allocatable: function has no frame pointer");
27370#ifndef NDEBUG
27371 else {
27372 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27373 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27374 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27375        "Invalid Frame Register!");
27376 }
27377#endif
27378 }
27379
27380 if (Reg)
27381 return Reg;
27382
27383 report_fatal_error("Invalid register name global variable");
27384}
27385
27386SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27387 SelectionDAG &DAG) const {
27388 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27389 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27390}
27391
27392Register X86TargetLowering::getExceptionPointerRegister(
27393 const Constant *PersonalityFn) const {
27394 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27395 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27396
27397 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27398}
27399
27400Register X86TargetLowering::getExceptionSelectorRegister(
27401 const Constant *PersonalityFn) const {
27402 // Funclet personalities don't use selectors (the runtime does the selection).
27403 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27404 return X86::NoRegister;
27405 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27406}
27407
27408bool X86TargetLowering::needsFixedCatchObjects() const {
27409 return Subtarget.isTargetWin64();
27410}
27411
27412SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27413 SDValue Chain = Op.getOperand(0);
27414 SDValue Offset = Op.getOperand(1);
27415 SDValue Handler = Op.getOperand(2);
27416 SDLoc dl (Op);
27417
27418 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27419 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27420 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27421 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27422         (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27423        "Invalid Frame Register!");
27424 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27425 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27426
27427 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27428 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27429 dl));
27430 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27431 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27432 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27433
27434 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27435 DAG.getRegister(StoreAddrReg, PtrVT));
27436}
27437
27438SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27439 SelectionDAG &DAG) const {
27440 SDLoc DL(Op);
27441 // If the subtarget is not 64bit, we may need the global base reg
27442 // after isel expand pseudo, i.e., after CGBR pass ran.
27443 // Therefore, ask for the GlobalBaseReg now, so that the pass
27444 // inserts the code for us in case we need it.
27445 // Otherwise, we will end up in a situation where we will
27446 // reference a virtual register that is not defined!
27447 if (!Subtarget.is64Bit()) {
27448 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27449 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27450 }
27451 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27452 DAG.getVTList(MVT::i32, MVT::Other),
27453 Op.getOperand(0), Op.getOperand(1));
27454}
27455
27456SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27457 SelectionDAG &DAG) const {
27458 SDLoc DL(Op);
27459 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27460 Op.getOperand(0), Op.getOperand(1));
27461}
27462
27463SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27464 SelectionDAG &DAG) const {
27465 SDLoc DL(Op);
27466 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27467 Op.getOperand(0));
27468}
27469
27470static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27471 return Op.getOperand(0);
27472}
27473
27474SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27475 SelectionDAG &DAG) const {
27476 SDValue Root = Op.getOperand(0);
27477 SDValue Trmp = Op.getOperand(1); // trampoline
27478 SDValue FPtr = Op.getOperand(2); // nested function
27479 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27480 SDLoc dl (Op);
27481
27482 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27483 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27484
27485 if (Subtarget.is64Bit()) {
27486 SDValue OutChains[6];
27487
27488 // Large code-model.
27489 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27490 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27491
27492 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27493 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27494
27495 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27496
27497 // Load the pointer to the nested function into R11.
27498 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27499 SDValue Addr = Trmp;
27500 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27501 Addr, MachinePointerInfo(TrmpAddr));
27502
27503 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27504 DAG.getConstant(2, dl, MVT::i64));
27505 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27506 MachinePointerInfo(TrmpAddr, 2), Align(2));
27507
27508 // Load the 'nest' parameter value into R10.
27509 // R10 is specified in X86CallingConv.td
27510 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27511 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27512 DAG.getConstant(10, dl, MVT::i64));
27513 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27514 Addr, MachinePointerInfo(TrmpAddr, 10));
27515
27516 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27517 DAG.getConstant(12, dl, MVT::i64));
27518 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27519 MachinePointerInfo(TrmpAddr, 12), Align(2));
27520
27521 // Jump to the nested function.
27522 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27523 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27524 DAG.getConstant(20, dl, MVT::i64));
27525 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27526 Addr, MachinePointerInfo(TrmpAddr, 20));
27527
27528 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27529 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27530 DAG.getConstant(22, dl, MVT::i64));
27531 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27532 Addr, MachinePointerInfo(TrmpAddr, 22));
27533
27534 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27535 } else {
27536 const Function *Func =
27537 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27538 CallingConv::ID CC = Func->getCallingConv();
27539 unsigned NestReg;
27540
27541 switch (CC) {
27542 default:
27543 llvm_unreachable("Unsupported calling convention");
27544 case CallingConv::C:
27545 case CallingConv::X86_StdCall: {
27546 // Pass 'nest' parameter in ECX.
27547 // Must be kept in sync with X86CallingConv.td
27548 NestReg = X86::ECX;
27549
27550 // Check that ECX wasn't needed by an 'inreg' parameter.
27551 FunctionType *FTy = Func->getFunctionType();
27552 const AttributeList &Attrs = Func->getAttributes();
27553
27554 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27555 unsigned InRegCount = 0;
27556 unsigned Idx = 0;
27557
27558 for (FunctionType::param_iterator I = FTy->param_begin(),
27559 E = FTy->param_end(); I != E; ++I, ++Idx)
27560 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27561 const DataLayout &DL = DAG.getDataLayout();
27562 // FIXME: should only count parameters that are lowered to integers.
27563 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27564 }
27565
27566 if (InRegCount > 2) {
27567 report_fatal_error("Nest register in use - reduce number of inreg"
27568 " parameters!");
27569 }
27570 }
27571 break;
27572 }
27573 case CallingConv::X86_FastCall:
27574 case CallingConv::X86_ThisCall:
27575 case CallingConv::Fast:
27576 case CallingConv::Tail:
27577 case CallingConv::SwiftTail:
27578 // Pass 'nest' parameter in EAX.
27579 // Must be kept in sync with X86CallingConv.td
27580 NestReg = X86::EAX;
27581 break;
27582 }
27583
27584 SDValue OutChains[4];
27585 SDValue Addr, Disp;
27586
27587 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27588 DAG.getConstant(10, dl, MVT::i32));
27589 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27590
27591 // This is storing the opcode for MOV32ri.
27592 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27593 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27594 OutChains[0] =
27595 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27596 Trmp, MachinePointerInfo(TrmpAddr));
27597
27598 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27599 DAG.getConstant(1, dl, MVT::i32));
27600 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27601 MachinePointerInfo(TrmpAddr, 1), Align(1));
27602
27603 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27604 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27605 DAG.getConstant(5, dl, MVT::i32));
27606 OutChains[2] =
27607 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27608 MachinePointerInfo(TrmpAddr, 5), Align(1));
27609
27610 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27611 DAG.getConstant(6, dl, MVT::i32));
27612 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27613 MachinePointerInfo(TrmpAddr, 6), Align(1));
27614
27615 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27616 }
27617}
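For illustration only, a minimal sketch of the 23-byte 64-bit trampoline that the stores above assemble (movabsq $fptr,%r11; movabsq $nest,%r10; jmpq *%r11). writeTrampoline64 is a hypothetical helper; the byte values simply restate the REX_WB/MOV64ri/JMP64r/ModRM constants used in the lowering, with the i16 opcode stores laid out little-endian as on x86:

#include <cstdint>
#include <cstring>

void writeTrampoline64(uint8_t *Trmp, uint64_t FPtr, uint64_t Nest) {
  Trmp[0] = 0x49; Trmp[1] = 0xBB;    // REX.WB + (B8|3): movabsq imm64, %r11
  std::memcpy(Trmp + 2, &FPtr, 8);   // nested function pointer
  Trmp[10] = 0x49; Trmp[11] = 0xBA;  // REX.WB + (B8|2): movabsq imm64, %r10
  std::memcpy(Trmp + 12, &Nest, 8);  // 'nest' parameter value
  Trmp[20] = 0x49; Trmp[21] = 0xFF;  // REX.WB + FF: jmpq *...
  Trmp[22] = 0xE3;                   // ModRM: mod=11, reg=/4, rm=r11
}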
27618
27619SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
27620 SelectionDAG &DAG) const {
27621 /*
27622 The rounding mode is in bits 11:10 of FPSR, and has the following
27623 settings:
27624 00 Round to nearest
27625 01 Round to -inf
27626 10 Round to +inf
27627 11 Round to 0
27628
27629 FLT_ROUNDS, on the other hand, expects the following:
27630 -1 Undefined
27631 0 Round to 0
27632 1 Round to nearest
27633 2 Round to +inf
27634 3 Round to -inf
27635
27636 To perform the conversion, we use a packed lookup table of the four 2-bit
27637 values that we can index by FPSR[11:10]
27638 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27639
27640 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27641 */
27642
27643 MachineFunction &MF = DAG.getMachineFunction();
27644 MVT VT = Op.getSimpleValueType();
27645 SDLoc DL(Op);
27646
27647 // Save FP Control Word to stack slot
27648 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27649 SDValue StackSlot =
27650 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27651
27652 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27653
27654 SDValue Chain = Op.getOperand(0);
27655 SDValue Ops[] = {Chain, StackSlot};
27656 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27657 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27658 Align(2), MachineMemOperand::MOStore);
27659
27660 // Load FP Control Word from stack slot
27661 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27662 Chain = CWD.getValue(1);
27663
27664 // Mask and turn the control bits into a shift for the lookup table.
27665 SDValue Shift =
27666 DAG.getNode(ISD::SRL, DL, MVT::i16,
27667 DAG.getNode(ISD::AND, DL, MVT::i16,
27668 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27669 DAG.getConstant(9, DL, MVT::i8));
27670 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27671
27672 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27673 SDValue RetVal =
27674 DAG.getNode(ISD::AND, DL, MVT::i32,
27675 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27676 DAG.getConstant(3, DL, MVT::i32));
27677
27678 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27679
27680 return DAG.getMergeValues({RetVal, Chain}, DL);
27681}
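A minimal scalar sketch of the lookup-table trick used above, with a hypothetical helper name; it spells out why (0x2d >> ((CW & 0xc00) >> 9)) & 3 yields the FLT_ROUNDS encoding for each x87 RC value:

#include <cassert>
#include <cstdint>

// 0x2d packs the four 2-bit FLT_ROUNDS results, indexed by 2 * RC.
int fltRoundsFromControlWord(uint16_t CW) {
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}

// RC=0 (nearest)->1, RC=1 (-inf)->3, RC=2 (+inf)->2, RC=3 (zero)->0.
void checkFltRounds() {
  assert(fltRoundsFromControlWord(0 << 10) == 1);
  assert(fltRoundsFromControlWord(1 << 10) == 3);
  assert(fltRoundsFromControlWord(2 << 10) == 2);
  assert(fltRoundsFromControlWord(3 << 10) == 0);
}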
27682
27683SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27684 SelectionDAG &DAG) const {
27685 MachineFunction &MF = DAG.getMachineFunction();
27686 SDLoc DL(Op);
27687 SDValue Chain = Op.getNode()->getOperand(0);
27688
27689 // FP control word may be set only from data in memory. So we need to allocate
27690 // stack space to save/load FP control word.
27691 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27692 SDValue StackSlot =
27693 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27694 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27695 MachineMemOperand *MMO =
27696 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27697
27698 // Store FP control word into memory.
27699 SDValue Ops[] = {Chain, StackSlot};
27700 Chain = DAG.getMemIntrinsicNode(
27701 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27702
27703 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27704 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27705 Chain = CWD.getValue(1);
27706 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27707 DAG.getConstant(0xf3ff, DL, MVT::i16));
27708
27709 // Calculate new rounding mode.
27710 SDValue NewRM = Op.getNode()->getOperand(1);
27711 SDValue RMBits;
27712 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27713 uint64_t RM = CVal->getZExtValue();
27714 int FieldVal;
27715 switch (static_cast<RoundingMode>(RM)) {
27716 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27717 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27718 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27719 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27720 default:
27721 llvm_unreachable("rounding mode is not supported by X86 hardware");
27722 }
27723 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27724 } else {
27725 // Need to convert argument into bits of control word:
27726 // 0 Round to 0 -> 11
27727 // 1 Round to nearest -> 00
27728 // 2 Round to +inf -> 10
27729 // 3 Round to -inf -> 01
27730 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27731 // To make the conversion, put all these values into a value 0xc9 and shift
27732 // it left depending on the rounding mode:
27733 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27734 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27735 // ...
27736 // (0xc9 << (2 * NewRM + 4)) & 0xc00
27737 SDValue ShiftValue =
27738 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27739 DAG.getNode(ISD::ADD, DL, MVT::i32,
27740 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27741 DAG.getConstant(1, DL, MVT::i8)),
27742 DAG.getConstant(4, DL, MVT::i32)));
27743 SDValue Shifted =
27744 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27745 ShiftValue);
27746 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27747 DAG.getConstant(0xc00, DL, MVT::i16));
27748 }
27749
27750 // Update rounding mode bits and store the new FP Control Word into stack.
27751 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27752 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
27753
27754 // Load FP control word from the slot.
27755 SDValue OpsLD[] = {Chain, StackSlot};
27756 MachineMemOperand *MMOL =
27757 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27758 Chain = DAG.getMemIntrinsicNode(
27759 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27760
27761 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27762 // same way but in bits 14:13.
27763 if (Subtarget.hasSSE1()) {
27764 // Store MXCSR into memory.
27765 Chain = DAG.getNode(
27766 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27767 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27768 StackSlot);
27769
27770 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27771 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27772 Chain = CWD.getValue(1);
27773 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27774 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27775
27776 // Shift X87 RM bits from 11:10 to 14:13.
27777 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27778 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27779 DAG.getConstant(3, DL, MVT::i8));
27780
27781 // Update rounding mode bits and store the new FP Control Word into stack.
27782 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27783 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 4);
27784
27785 // Load MXCSR from the slot.
27786 Chain = DAG.getNode(
27787 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27788 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27789 StackSlot);
27790 }
27791
27792 return Chain;
27793}
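A minimal scalar sketch of the non-constant rounding-mode encoding above, with a hypothetical helper name; it checks that (0xc9 << (2 * NewRM + 4)) & 0xc00 produces the intended RC bits (11:10) for each input:

#include <cassert>
#include <cstdint>

uint16_t rmBitsFromNewRM(uint32_t NewRM) {
  return static_cast<uint16_t>((0xc9u << (2 * NewRM + 4)) & 0xc00);
}

// 0 (toward zero)->11, 1 (nearest)->00, 2 (+inf)->10, 3 (-inf)->01.
void checkRmBits() {
  assert(rmBitsFromNewRM(0) == 0xc00);  // X86::rmTowardZero
  assert(rmBitsFromNewRM(1) == 0x000);  // X86::rmToNearest
  assert(rmBitsFromNewRM(2) == 0x800);  // X86::rmUpward
  assert(rmBitsFromNewRM(3) == 0x400);  // X86::rmDownward
}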
27794
27795/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27796//
27797// i8/i16 vector implemented using dword LZCNT vector instruction
27798// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27799// split the vector, perform the operation on its Lo and Hi parts and
27800// concatenate the results.
27801static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27802 const X86Subtarget &Subtarget) {
27803 assert(Op.getOpcode() == ISD::CTLZ);
27804 SDLoc dl(Op);
27805 MVT VT = Op.getSimpleValueType();
27806 MVT EltVT = VT.getVectorElementType();
27807 unsigned NumElems = VT.getVectorNumElements();
27808
27809 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27810        "Unsupported element type");
27811
27812 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27813 if (NumElems > 16 ||
27814 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27815 return splitVectorIntUnary(Op, DAG);
27816
27817 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27818 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27819        "Unsupported value type for operation");
27820
27821 // Use the natively supported vector instruction vplzcntd.
27822 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27823 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27824 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27825 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27826
27827 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27828}
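A minimal scalar model of the per-element math above (zero-extend, dword lzcnt, subtract the width delta), using a hypothetical helper; __builtin_clz (a GCC/Clang builtin) stands in for vplzcntd, and the zero case is handled explicitly since __builtin_clz(0) is undefined:

#include <cstdint>

// For an i8/i16 element x: ctlz(x) == ctlz32(zext(x)) - (32 - EltBits),
// which also holds for x == 0 because the dword lzcnt of 0 is 32.
unsigned ctlzNarrowViaDword(uint32_t ZExtX, unsigned EltBits) {
  unsigned Lz32 = ZExtX ? __builtin_clz(ZExtX) : 32;
  return Lz32 - (32 - EltBits);
}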
27829
27830// Lower CTLZ using a PSHUFB lookup table implementation.
27831static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27832 const X86Subtarget &Subtarget,
27833 SelectionDAG &DAG) {
27834 MVT VT = Op.getSimpleValueType();
27835 int NumElts = VT.getVectorNumElements();
27836 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27837 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27838
27839 // Per-nibble leading zero PSHUFB lookup table.
27840 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27841 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27842 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27843 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27844
27845 SmallVector<SDValue, 64> LUTVec;
27846 for (int i = 0; i < NumBytes; ++i)
27847 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27848 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27849
27850 // Begin by bitcasting the input to a byte vector, then split those bytes
27851 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27852 // If the hi input nibble is zero then we add both results together, otherwise
27853 // we just take the hi result (by masking the lo result to zero before the
27854 // add).
27855 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27856 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27857
27858 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27859 SDValue Lo = Op0;
27860 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27861 SDValue HiZ;
27862 if (CurrVT.is512BitVector()) {
27863 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27864 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27865 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27866 } else {
27867 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27868 }
27869
27870 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27871 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27872 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27873 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27874
27875 // Merge result back from vXi8 back to VT, working on the lo/hi halves
27876 // of the current vector width in the same way we did for the nibbles.
27877 // If the upper half of the input element is zero then add the halves'
27878 // leading zero counts together, otherwise just use the upper half's.
27879 // Double the width of the result until we are at target width.
27880 while (CurrVT != VT) {
27881 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27882 int CurrNumElts = CurrVT.getVectorNumElements();
27883 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27884 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27885 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27886
27887 // Check if the upper half of the input element is zero.
27888 if (CurrVT.is512BitVector()) {
27889 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27890 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27891 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27892 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27893 } else {
27894 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27895 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27896 }
27897 HiZ = DAG.getBitcast(NextVT, HiZ);
27898
27899 // Move the upper/lower halves to the lower bits as we'll be extending to
27900 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27901 // together.
27902 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27903 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27904 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27905 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27906 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27907 CurrVT = NextVT;
27908 }
27909
27910 return Res;
27911}
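A minimal scalar model of the per-byte step above, with a hypothetical helper name: look up the leading-zero count of each nibble in the PSHUFB table and only add the low-nibble count when the high nibble is zero:

#include <cstdint>

unsigned ctlz8ViaNibbleLUT(uint8_t X) {
  // Same per-nibble leading-zero table as the PSHUFB LUT above.
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = X >> 4, Lo = X & 0xf;
  return Hi ? LUT[Hi] : 4 + LUT[Lo]; // X == 0 gives 8, as expected for ctlz
}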
27912
27913static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27914 const X86Subtarget &Subtarget,
27915 SelectionDAG &DAG) {
27916 MVT VT = Op.getSimpleValueType();
27917
27918 if (Subtarget.hasCDI() &&
27919 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27920 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27921 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27922
27923 // Decompose 256-bit ops into smaller 128-bit ops.
27924 if (VT.is256BitVector() && !Subtarget.hasInt256())
27925 return splitVectorIntUnary(Op, DAG);
27926
27927 // Decompose 512-bit ops into smaller 256-bit ops.
27928 if (VT.is512BitVector() && !Subtarget.hasBWI())
27929 return splitVectorIntUnary(Op, DAG);
27930
27931  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
27932 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
27933}
27934
27935static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
27936 SelectionDAG &DAG) {
27937 MVT VT = Op.getSimpleValueType();
27938 MVT OpVT = VT;
27939 unsigned NumBits = VT.getSizeInBits();
27940 SDLoc dl(Op);
27941 unsigned Opc = Op.getOpcode();
27942
27943 if (VT.isVector())
27944 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
27945
27946 Op = Op.getOperand(0);
27947 if (VT == MVT::i8) {
27948 // Zero extend to i32 since there is not an i8 bsr.
27949 OpVT = MVT::i32;
27950 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
27951 }
27952
27953 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
27954 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
27955 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
27956
27957 if (Opc == ISD::CTLZ) {
27958 // If src is zero (i.e. bsr sets ZF), returns NumBits.
27959 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
27960 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27961 Op.getValue(1)};
27962 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
27963 }
27964
27965 // Finally xor with NumBits-1.
27966 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
27967 DAG.getConstant(NumBits - 1, dl, OpVT));
27968
27969 if (VT == MVT::i8)
27970 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
27971 return Op;
27972}
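
Editorial aside (not part of the analyzed source): a minimal scalar sketch of the identity the lowering above relies on, ctlz(X) == (NumBits - 1) ^ bsr(X), where the CMOV substitutes NumBits + NumBits - 1 for a zero input so the final XOR produces NumBits. The helper bsr32 is a hypothetical stand-in for X86ISD::BSR.

#include <cstdint>

// Hypothetical stand-in for BSR: index of the highest set bit (X != 0).
static unsigned bsr32(uint32_t X) {
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}

static unsigned ctlz32(uint32_t X) {
  const unsigned NumBits = 32;
  // CMOV path: a zero input (ZF set by BSR) yields NumBits + NumBits - 1.
  unsigned R = (X == 0) ? (NumBits + NumBits - 1) : bsr32(X);
  return R ^ (NumBits - 1); // ctlz32(0) == 32, ctlz32(1) == 31
}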
27973
27974static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
27975 SelectionDAG &DAG) {
27976 MVT VT = Op.getSimpleValueType();
27977 unsigned NumBits = VT.getScalarSizeInBits();
27978 SDValue N0 = Op.getOperand(0);
27979 SDLoc dl(Op);
27980
27981  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
27982         "Only scalar CTTZ requires custom lowering");
27983
27984 // Issue a bsf (scan bits forward) which also sets EFLAGS.
27985 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
27986 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
27987
27988 // If src is zero (i.e. bsf sets ZF), returns NumBits.
27989 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
27990 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
27991 Op.getValue(1)};
27992 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
27993}
27994
27995static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
27996 const X86Subtarget &Subtarget) {
27997 MVT VT = Op.getSimpleValueType();
27998 if (VT == MVT::i16 || VT == MVT::i32)
27999 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
28000
28001 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28002 return splitVectorIntBinary(Op, DAG);
28003
28004  assert(Op.getSimpleValueType().is256BitVector() &&
28005         Op.getSimpleValueType().isInteger() &&
28006         "Only handle AVX 256-bit vector integer operation");
28007 return splitVectorIntBinary(Op, DAG);
28008}
28009
28010static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28011 const X86Subtarget &Subtarget) {
28012 MVT VT = Op.getSimpleValueType();
28013 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28014 unsigned Opcode = Op.getOpcode();
28015 SDLoc DL(Op);
28016
28017 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28018 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28019    assert(Op.getSimpleValueType().isInteger() &&
28020           "Only handle AVX vector integer operation");
28021 return splitVectorIntBinary(Op, DAG);
28022 }
28023
28024 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28025 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28026 EVT SetCCResultType =
28027 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28028
28029 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
28030 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28031 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28032 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28033 // TODO: Move this to DAGCombiner?
28034 if (SetCCResultType == VT &&
28035 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28036 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28037 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28038 }
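
Editorial aside (not part of the analyzed source): a scalar sketch of the expansion above; usubsat subtracts and clamps at zero, and when the setcc result is an all-ones/all-zeros mask of the element width the select can be replaced by the AND shown in the code.

#include <cstdint>

// usubsat X, Y --> (X >u Y) ? X - Y : 0
static uint8_t usubsat8(uint8_t X, uint8_t Y) {
  return X > Y ? uint8_t(X - Y) : uint8_t(0);
}
// usubsat8(10, 3) == 7; usubsat8(3, 10) == 0 (saturates at zero).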
28039
28040 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28041 (!VT.isVector() || VT == MVT::v2i64)) {
28042 unsigned BitWidth = VT.getScalarSizeInBits();
28043 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28044 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28045 SDValue Zero = DAG.getConstant(0, DL, VT);
28046 SDValue Result =
28047 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28048 DAG.getVTList(VT, SetCCResultType), X, Y);
28049 SDValue SumDiff = Result.getValue(0);
28050 SDValue Overflow = Result.getValue(1);
28051 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28052 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28053 SDValue SumNeg =
28054 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28055 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28056 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28057 }
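
Editorial aside (not part of the analyzed source): a scalar sketch of the SADDSAT branch above, using the compiler builtin __builtin_add_overflow in place of ISD::SADDO. On overflow the wrapped sum has the opposite sign of the true sum, so a negative wrapped sum (SumNeg) selects SatMax and a non-negative one selects SatMin.

#include <cstdint>
#include <limits>

static int32_t saddsat32(int32_t X, int32_t Y) {
  int32_t Sum;
  bool Overflow = __builtin_add_overflow(X, Y, &Sum); // plays the role of SADDO
  if (!Overflow)
    return Sum;                                        // SumDiff
  return Sum < 0 ? std::numeric_limits<int32_t>::max() // SumNeg -> SatMax
                 : std::numeric_limits<int32_t>::min(); // otherwise SatMin
}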
28058
28059 // Use default expansion.
28060 return SDValue();
28061}
28062
28063static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28064 SelectionDAG &DAG) {
28065 MVT VT = Op.getSimpleValueType();
28066 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28067 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28068 // 8-bit integer abs to NEG and CMOV.
28069 SDLoc DL(Op);
28070 SDValue N0 = Op.getOperand(0);
28071 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28072 DAG.getConstant(0, DL, VT), N0);
28073 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
28074 SDValue(Neg.getNode(), 1)};
28075 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28076 }
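
Editorial aside (not part of the analyzed source): a scalar sketch of the NEG + CMOV sequence above. X86ISD::SUB computes 0 - N0 and sets EFLAGS; the CMOV on COND_GE (taken when the signed compare 0 >= N0 holds for those flags) then picks the negated value, otherwise keeps the original. The helper name absViaNegCmov is illustrative.

#include <cstdint>

static int32_t absViaNegCmov(int32_t X) {
  int32_t Neg = 0 - X;       // X86ISD::SUB, also produces the flags
  return (0 >= X) ? Neg : X; // X86ISD::CMOV on COND_GE
}
// absViaNegCmov(-5) == 5; absViaNegCmov(7) == 7 (INT_MIN wraps, as in the DAG).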
28077
28078 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28079 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28080 SDLoc DL(Op);
28081 SDValue Src = Op.getOperand(0);
28082 SDValue Sub =
28083 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
28084 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
28085 }
28086
28087 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28088    assert(VT.isInteger() &&
28089           "Only handle AVX 256-bit vector integer operation");
28090 return splitVectorIntUnary(Op, DAG);
28091 }
28092
28093 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28094 return splitVectorIntUnary(Op, DAG);
28095
28096 // Default to expand.
28097 return SDValue();
28098}
28099
28100static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
28101 MVT VT = Op.getSimpleValueType();
28102
28103 // For AVX1 cases, split to use legal ops (everything but v4i64).
28104 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
28105 return splitVectorIntBinary(Op, DAG);
28106
28107 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28108 return splitVectorIntBinary(Op, DAG);
28109
28110 // Default to expand.
28111 return SDValue();
28112}
28113
28114static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28115 SelectionDAG &DAG) {
28116 SDLoc dl(Op);
28117 MVT VT = Op.getSimpleValueType();
28118
28119 // Decompose 256-bit ops into 128-bit ops.
28120 if (VT.is256BitVector() && !Subtarget.hasInt256())
28121 return splitVectorIntBinary(Op, DAG);
28122
28123 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28124 return splitVectorIntBinary(Op, DAG);
28125
28126 SDValue A = Op.getOperand(0);
28127 SDValue B = Op.getOperand(1);
28128
28129 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28130 // vector pairs, multiply and truncate.
28131 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28132 unsigned NumElts = VT.getVectorNumElements();
28133
28134 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28135 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28136 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28137 return DAG.getNode(
28138 ISD::TRUNCATE, dl, VT,
28139 DAG.getNode(ISD::MUL, dl, ExVT,
28140 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28141 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28142 }
28143
28144 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28145
28146 // Extract the lo/hi parts and any-extend them to i16.
28147 // We're going to mask off the low byte of each result element of the
28148 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28149 // element.
28150 SDValue Undef = DAG.getUNDEF(VT);
28151 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28152 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28153
28154 SDValue BLo, BHi;
28155 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28156 // If the RHS is a constant, manually unpackl/unpackh.
28157 SmallVector<SDValue, 16> LoOps, HiOps;
28158 for (unsigned i = 0; i != NumElts; i += 16) {
28159 for (unsigned j = 0; j != 8; ++j) {
28160 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28161 MVT::i16));
28162 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28163 MVT::i16));
28164 }
28165 }
28166
28167 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28168 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28169 } else {
28170 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28171 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28172 }
28173
28174 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
28175 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28176 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28177 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
28178 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
28179 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28180 }
28181
28182 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28183 if (VT == MVT::v4i32) {
28184    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28185           "Should not custom lower when pmulld is available!");
28186
28187 // Extract the odd parts.
28188 static const int UnpackMask[] = { 1, -1, 3, -1 };
28189 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28190 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28191
28192 // Multiply the even parts.
28193 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28194 DAG.getBitcast(MVT::v2i64, A),
28195 DAG.getBitcast(MVT::v2i64, B));
28196 // Now multiply odd parts.
28197 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28198 DAG.getBitcast(MVT::v2i64, Aodds),
28199 DAG.getBitcast(MVT::v2i64, Bodds));
28200
28201 Evens = DAG.getBitcast(VT, Evens);
28202 Odds = DAG.getBitcast(VT, Odds);
28203
28204 // Merge the two vectors back together with a shuffle. This expands into 2
28205 // shuffles.
28206 static const int ShufMask[] = { 0, 4, 2, 6 };
28207 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28208 }
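
Editorial aside (not part of the analyzed source): a scalar sketch of the v4i32 path above, assuming the usual PMULUDQ semantics (a 32x32->64 multiply of the even lanes of each input); the final <0,4,2,6> shuffle interleaves the low 32-bit halves of the even and odd products back into lane order.

#include <cstdint>

static void mulV4I32(const uint32_t A[4], const uint32_t B[4], uint32_t R[4]) {
  // Evens: pmuludq(A, B) multiplies lanes 0 and 2.
  uint64_t E0 = (uint64_t)A[0] * B[0], E2 = (uint64_t)A[2] * B[2];
  // Odds: pmuludq on the <1,-1,3,-1> shuffles multiplies lanes 1 and 3.
  uint64_t O1 = (uint64_t)A[1] * B[1], O3 = (uint64_t)A[3] * B[3];
  // Shuffle <0,4,2,6> keeps only the low half of each 64-bit product.
  R[0] = (uint32_t)E0;
  R[1] = (uint32_t)O1;
  R[2] = (uint32_t)E2;
  R[3] = (uint32_t)O3;
}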
28209
28210  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28211         "Only know how to lower V2I64/V4I64/V8I64 multiply");
28212  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28213
28214 // Ahi = psrlqi(a, 32);
28215 // Bhi = psrlqi(b, 32);
28216 //
28217 // AloBlo = pmuludq(a, b);
28218 // AloBhi = pmuludq(a, Bhi);
28219 // AhiBlo = pmuludq(Ahi, b);
28220 //
28221 // Hi = psllqi(AloBhi + AhiBlo, 32);
28222 // return AloBlo + Hi;
28223 KnownBits AKnown = DAG.computeKnownBits(A);
28224 KnownBits BKnown = DAG.computeKnownBits(B);
28225
28226 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28227 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28228 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28229
28230 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28231 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28232 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28233
28234 SDValue Zero = DAG.getConstant(0, dl, VT);
28235
28236 // Only multiply lo/hi halves that aren't known to be zero.
28237 SDValue AloBlo = Zero;
28238 if (!ALoIsZero && !BLoIsZero)
28239 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28240
28241 SDValue AloBhi = Zero;
28242 if (!ALoIsZero && !BHiIsZero) {
28243 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28244 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28245 }
28246
28247 SDValue AhiBlo = Zero;
28248 if (!AHiIsZero && !BLoIsZero) {
28249 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28250 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28251 }
28252
28253 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28254 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28255
28256 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28257}
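
Editorial aside (not part of the analyzed source): a scalar sketch of the pseudo-code comment above, building the low 64 bits of a 64x64 product from three 32x32->64 PMULUDQ-style multiplies; the AhiBhi term is dropped because it only contributes to bits 64 and above.

#include <cstdint>

static uint64_t mul64ViaPMULUDQ(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;           // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;           // pmuludq(a, Bhi)
  uint64_t AhiBlo = AHi * BLo;           // pmuludq(Ahi, b)
  uint64_t Hi = (AloBhi + AhiBlo) << 32; // psllqi(AloBhi + AhiBlo, 32)
  return AloBlo + Hi;
}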
28258
28259static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28260 MVT VT, bool IsSigned,
28261 const X86Subtarget &Subtarget,
28262 SelectionDAG &DAG,
28263 SDValue *Low = nullptr) {
28264 unsigned NumElts = VT.getVectorNumElements();
28265
28266 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28267 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28268 // lane results back together.
28269
28270 // We'll take different approaches for signed and unsigned.
28271 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28272 // and use pmullw to calculate the full 16-bit product.
28273 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28274 // shift them left into the upper byte of each word. This allows us to use
28275 // pmulhw to calculate the full 16-bit product. This trick means we don't
28276 // need to sign extend the bytes to use pmullw.
28277
28278 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28279 SDValue Zero = DAG.getConstant(0, dl, VT);
28280
28281 SDValue ALo, AHi;
28282 if (IsSigned) {
28283 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28284 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28285 } else {
28286 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28287 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28288 }
28289
28290 SDValue BLo, BHi;
28291 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28292 // If the RHS is a constant, manually unpackl/unpackh and extend.
28293 SmallVector<SDValue, 16> LoOps, HiOps;
28294 for (unsigned i = 0; i != NumElts; i += 16) {
28295 for (unsigned j = 0; j != 8; ++j) {
28296 SDValue LoOp = B.getOperand(i + j);
28297 SDValue HiOp = B.getOperand(i + j + 8);
28298
28299 if (IsSigned) {
28300 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28301 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28302 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28303 DAG.getConstant(8, dl, MVT::i16));
28304 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28305 DAG.getConstant(8, dl, MVT::i16));
28306 } else {
28307 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28308 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28309 }
28310
28311 LoOps.push_back(LoOp);
28312 HiOps.push_back(HiOp);
28313 }
28314 }
28315
28316 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28317 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28318 } else if (IsSigned) {
28319 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28320 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28321 } else {
28322 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28323 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28324 }
28325
28326 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
28327 // pack back to vXi8.
28328 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28329 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28330 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28331
28332 if (Low) {
28333 // Mask the lower bits and pack the results to rejoin the halves.
28334 SDValue Mask = DAG.getConstant(255, dl, ExVT);
28335 SDValue LLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, Mask);
28336 SDValue LHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, Mask);
28337 *Low = DAG.getNode(X86ISD::PACKUS, dl, VT, LLo, LHi);
28338 }
28339
28340 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
28341 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
28342
28343 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
28344 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
28345}
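
Editorial aside (not part of the analyzed source): a scalar sketch of the signed trick described above. Placing each byte in the high half of a 16-bit word means PMULHW's implicit >> 16 recovers the full signed 8x8->16 product, since ((a << 8) * (b << 8)) >> 16 == a * b. The helper name smul8x8 is illustrative.

#include <cstdint>

static int16_t smul8x8(int8_t A, int8_t B) {
  // punpck*bw with zeros below: each byte lands in the upper byte of a word.
  int16_t WA = (int16_t)((uint16_t)(uint8_t)A << 8);
  int16_t WB = (int16_t)((uint16_t)(uint8_t)B << 8);
  return (int16_t)(((int32_t)WA * (int32_t)WB) >> 16); // pmulhw
}
// smul8x8(-1, -1) == 1; smul8x8(127, 2) == 254; smul8x8(-128, 3) == -384.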
28346
28347static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28348 SelectionDAG &DAG) {
28349 SDLoc dl(Op);
28350 MVT VT = Op.getSimpleValueType();
28351 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28352 unsigned NumElts = VT.getVectorNumElements();
28353 SDValue A = Op.getOperand(0);
28354 SDValue B = Op.getOperand(1);
28355
28356 // Decompose 256-bit ops into 128-bit ops.
28357 if (VT.is256BitVector() && !Subtarget.hasInt256())
28358 return splitVectorIntBinary(Op, DAG);
28359
28360 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28361 return splitVectorIntBinary(Op, DAG);
28362
28363 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28364    assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28365           (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28366           (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28367
28368 // PMULxD operations multiply each even value (starting at 0) of LHS with
28369 // the related value of RHS and produce a widened result.
28370 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28371 // => <2 x i64> <ae|cg>
28372 //
28373 // In other words, to have all the results, we need to perform two PMULxD:
28374 // 1. one with the even values.
28375 // 2. one with the odd values.
28376 // To achieve #2, we need to place the odd values at an even position.
28377 //
28378 // Place the odd value at an even position (basically, shift all values 1
28379 // step to the left):
28380 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28381 9, -1, 11, -1, 13, -1, 15, -1};
28382 // <a|b|c|d> => <b|undef|d|undef>
28383 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
28384 makeArrayRef(&Mask[0], NumElts));
28385 // <e|f|g|h> => <f|undef|h|undef>
28386 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
28387 makeArrayRef(&Mask[0], NumElts));
28388
28389 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28390 // ints.
28391 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28392 unsigned Opcode =
28393 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28394 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28395 // => <2 x i64> <ae|cg>
28396 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28397 DAG.getBitcast(MulVT, A),
28398 DAG.getBitcast(MulVT, B)));
28399 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28400 // => <2 x i64> <bf|dh>
28401 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28402 DAG.getBitcast(MulVT, Odd0),
28403 DAG.getBitcast(MulVT, Odd1)));
28404
28405 // Shuffle it back into the right order.
28406 SmallVector<int, 16> ShufMask(NumElts);
28407 for (int i = 0; i != (int)NumElts; ++i)
28408 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28409
28410 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28411
28412 // If we have a signed multiply but no PMULDQ fix up the result of an
28413 // unsigned multiply.
28414 if (IsSigned && !Subtarget.hasSSE41()) {
28415 SDValue Zero = DAG.getConstant(0, dl, VT);
28416 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28417 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28418 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28419 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28420
28421 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28422 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28423 }
28424
28425 return Res;
28426 }
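
Editorial aside (not part of the analyzed source): a scalar sketch of the pre-SSE41 fixup above, under the standard identity mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0); the T1/T2 AND-with-setcc pairs compute exactly those two correction terms.

#include <cstdint>

static int32_t mulhsViaMulhu(int32_t A, int32_t B) {
  uint32_t HiU =
      (uint32_t)(((uint64_t)(uint32_t)A * (uint64_t)(uint32_t)B) >> 32);
  uint32_t Fixup = (A < 0 ? (uint32_t)B : 0u) + (B < 0 ? (uint32_t)A : 0u);
  return (int32_t)(HiU - Fixup); // unsigned high half minus T1 + T2
}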
28427
28428 // Only i8 vectors should need custom lowering after this.
28429  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28430          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28431         "Unsupported vector type");
28432
28433 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28434 // logical shift down the upper half and pack back to i8.
28435
28436 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28437 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28438
28439 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28440 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28441 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28442 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28443 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28444 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28445 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28446 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28447 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28448 }
28449
28450 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28451}
28452
28453// Custom lowering for SMULO/UMULO.
28454static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28455 SelectionDAG &DAG) {
28456 MVT VT = Op.getSimpleValueType();
28457
28458 // Scalars defer to LowerXALUO.
28459 if (!VT.isVector())
28460 return LowerXALUO(Op, DAG);
28461
28462 SDLoc dl(Op);
28463 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28464 SDValue A = Op.getOperand(0);
28465 SDValue B = Op.getOperand(1);
28466 EVT OvfVT = Op->getValueType(1);
28467
28468 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28469 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28470 // Extract the LHS Lo/Hi vectors
28471 SDValue LHSLo, LHSHi;
28472 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28473
28474 // Extract the RHS Lo/Hi vectors
28475 SDValue RHSLo, RHSHi;
28476 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28477
28478 EVT LoOvfVT, HiOvfVT;
28479 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28480 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28481 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28482
28483 // Issue the split operations.
28484 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28485 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28486
28487 // Join the separate data results and the overflow results.
28488 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28489 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28490 Hi.getValue(1));
28491
28492 return DAG.getMergeValues({Res, Ovf}, dl);
28493 }
28494
28495 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28496 EVT SetccVT =
28497 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28498
28499 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28500 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28501 unsigned NumElts = VT.getVectorNumElements();
28502 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28503 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28504 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28505 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28506 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28507
28508 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28509
28510 SDValue Ovf;
28511 if (IsSigned) {
28512 SDValue High, LowSign;
28513 if (OvfVT.getVectorElementType() == MVT::i1 &&
28514 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28515 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28516 // Shift the high down filling with sign bits.
28517 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28518 // Fill all 16 bits with the sign bit from the low.
28519 LowSign =
28520 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28521 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28522 15, DAG);
28523 SetccVT = OvfVT;
28524 if (!Subtarget.hasBWI()) {
28525 // We can't do a vXi16 compare so sign extend to v16i32.
28526 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28527 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28528 }
28529 } else {
28530 // Otherwise do the compare at vXi8.
28531 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28532 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28533 LowSign =
28534 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28535 }
28536
28537 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28538 } else {
28539 SDValue High =
28540 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28541 if (OvfVT.getVectorElementType() == MVT::i1 &&
28542 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28543 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28544 SetccVT = OvfVT;
28545 if (!Subtarget.hasBWI()) {
28546 // We can't do a vXi16 compare so sign extend to v16i32.
28547 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28548 }
28549 } else {
28550 // Otherwise do the compare at vXi8.
28551 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28552 }
28553
28554 Ovf =
28555 DAG.getSetCC(dl, SetccVT, High,
28556 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28557 }
28558
28559 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28560
28561 return DAG.getMergeValues({Low, Ovf}, dl);
28562 }
28563
28564 SDValue Low;
28565 SDValue High =
28566 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28567
28568 SDValue Ovf;
28569 if (IsSigned) {
28570 // SMULO overflows if the high bits don't match the sign of the low.
28571 SDValue LowSign =
28572 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28573 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28574 } else {
28575 // UMULO overflows if the high bits are non-zero.
28576 Ovf =
28577 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28578 }
28579
28580 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28581
28582 return DAG.getMergeValues({Low, Ovf}, dl);
28583}
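
Editorial aside (not part of the analyzed source): a scalar sketch of the overflow checks above: UMULO overflows when the high half of the widened product is non-zero, and SMULO overflows when the high half differs from the sign-extension of the low half. The helper name smulo8 is illustrative.

#include <cstdint>

static bool smulo8(int8_t A, int8_t B, int8_t &Low) {
  int16_t Mul = (int16_t)A * (int16_t)B; // widened multiply
  Low = (int8_t)Mul;
  int8_t High = (int8_t)(Mul >> 8);      // high half of the product
  int8_t LowSign = (int8_t)(Low >> 7);   // 0 or -1: sign bits of the low half
  return High != LowSign;                // setcc(LowSign, High, SETNE)
}
// smulo8(16, 8, L) -> true (128 does not fit in i8); smulo8(-8, 16, L) -> false.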
28584
28585SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28586  assert(Subtarget.isTargetWin64() && "Unexpected target");
28587 EVT VT = Op.getValueType();
28588  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28589         "Unexpected return type for lowering");
28590
28591 RTLIB::Libcall LC;
28592 bool isSigned;
28593 switch (Op->getOpcode()) {
28594  default: llvm_unreachable("Unexpected request for libcall!");
28595 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28596 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28597 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28598 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28599 }
28600
28601 SDLoc dl(Op);
28602 SDValue InChain = DAG.getEntryNode();
28603
28604 TargetLowering::ArgListTy Args;
28605 TargetLowering::ArgListEntry Entry;
28606 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28607 EVT ArgVT = Op->getOperand(i).getValueType();
28608    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28609           "Unexpected argument type for lowering");
28610 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28611 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28612 MachinePointerInfo MPI =
28613 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28614 Entry.Node = StackPtr;
28615 InChain =
28616 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28617 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28618 Entry.Ty = PointerType::get(ArgTy,0);
28619 Entry.IsSExt = false;
28620 Entry.IsZExt = false;
28621 Args.push_back(Entry);
28622 }
28623
28624 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28625 getPointerTy(DAG.getDataLayout()));
28626
28627 TargetLowering::CallLoweringInfo CLI(DAG);
28628 CLI.setDebugLoc(dl)
28629 .setChain(InChain)
28630 .setLibCallee(
28631 getLibcallCallingConv(LC),
28632 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28633 std::move(Args))
28634 .setInRegister()
28635 .setSExtResult(isSigned)
28636 .setZExtResult(!isSigned);
28637
28638 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28639 return DAG.getBitcast(VT, CallInfo.first);
28640}
28641
28642// Return true if the required (according to Opcode) shift-imm form is natively
28643// supported by the Subtarget
28644static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
28645 unsigned Opcode) {
28646 if (VT.getScalarSizeInBits() < 16)
28647 return false;
28648
28649 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
28650 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
28651 return true;
28652
28653 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
28654 (VT.is256BitVector() && Subtarget.hasInt256());
28655
28656 bool AShift = LShift && (Subtarget.hasAVX512() ||
28657 (VT != MVT::v2i64 && VT != MVT::v4i64));
28658 return (Opcode == ISD::SRA) ? AShift : LShift;
28659}
28660
28661// The shift amount is a variable, but it is the same for all vector lanes.
28662// These instructions are defined together with shift-immediate.
28663static
28664bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
28665 unsigned Opcode) {
28666 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
28667}
28668
28669// Return true if the required (according to Opcode) variable-shift form is
28670// natively supported by the Subtarget
28671static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
28672 unsigned Opcode) {
28673
28674 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
28675 return false;
28676
28677 // vXi16 supported only on AVX-512, BWI
28678 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
28679 return false;
28680
28681 if (Subtarget.hasAVX512())
28682 return true;
28683
28684 bool LShift = VT.is128BitVector() || VT.is256BitVector();
28685 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
28686 return (Opcode == ISD::SRA) ? AShift : LShift;
28687}
28688
28689static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
28690 const X86Subtarget &Subtarget) {
28691 MVT VT = Op.getSimpleValueType();
28692 SDLoc dl(Op);
28693 SDValue R = Op.getOperand(0);
28694 SDValue Amt = Op.getOperand(1);
28695 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
28696
28697 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
28698    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
28699 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
28700 SDValue Ex = DAG.getBitcast(ExVT, R);
28701
28702 // ashr(R, 63) === cmp_slt(R, 0)
28703 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
28704      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
28705             "Unsupported PCMPGT op");
28706 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
28707 }
28708
28709 if (ShiftAmt >= 32) {
28710 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
28711 SDValue Upper =
28712 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
28713 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28714 ShiftAmt - 32, DAG);
28715 if (VT == MVT::v2i64)
28716 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
28717 if (VT == MVT::v4i64)
28718 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28719 {9, 1, 11, 3, 13, 5, 15, 7});
28720 } else {
28721 // SRA upper i32, SRL whole i64 and select lower i32.
28722 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
28723 ShiftAmt, DAG);
28724 SDValue Lower =
28725 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
28726 Lower = DAG.getBitcast(ExVT, Lower);
28727 if (VT == MVT::v2i64)
28728 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
28729 if (VT == MVT::v4i64)
28730 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
28731 {8, 1, 10, 3, 12, 5, 14, 7});
28732 }
28733 return DAG.getBitcast(VT, Ex);
28734 };
28735
28736 // Optimize shl/srl/sra with constant shift amount.
28737 APInt APIntShiftAmt;
28738 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
28739 return SDValue();
28740
28741 // If the shift amount is out of range, return undef.
28742 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
28743 return DAG.getUNDEF(VT);
28744
28745 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
28746
28747 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
28748 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
28749
28750 // i64 SRA needs to be performed as partial shifts.
28751 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
28752 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
28753 Op.getOpcode() == ISD::SRA)
28754 return ArithmeticShiftRight64(ShiftAmt);
28755
28756 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
28757 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
28758 unsigned NumElts = VT.getVectorNumElements();
28759 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28760
28761 // Simple i8 add case
28762 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
28763 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
28764 // must be 0). (add undef, undef) however can be any value. To make this
28765 // safe, we must freeze R to ensure that register allocation uses the same
28766 // register for an undefined value. This ensures that the result will
28767 // still be even and preserves the original semantics.
28768 R = DAG.getNode(ISD::FREEZE, dl, VT, R);
28769 return DAG.getNode(ISD::ADD, dl, VT, R, R);
28770 }
28771
28772 // ashr(R, 7) === cmp_slt(R, 0)
28773 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
28774 SDValue Zeros = DAG.getConstant(0, dl, VT);
28775 if (VT.is512BitVector()) {
28776        assert(VT == MVT::v64i8 && "Unexpected element type!");
28777 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
28778 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
28779 }
28780 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
28781 }
28782
28783 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
28784 if (VT == MVT::v16i8 && Subtarget.hasXOP())
28785 return SDValue();
28786
28787 if (Op.getOpcode() == ISD::SHL) {
28788 // Make a large shift.
28789 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
28790 ShiftAmt, DAG);
28791 SHL = DAG.getBitcast(VT, SHL);
28792 // Zero out the rightmost bits.
28793 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
28794 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
28795 }
28796 if (Op.getOpcode() == ISD::SRL) {
28797 // Make a large shift.
28798 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
28799 ShiftAmt, DAG);
28800 SRL = DAG.getBitcast(VT, SRL);
28801 // Zero out the leftmost bits.
28802 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
28803 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
28804 }
28805 if (Op.getOpcode() == ISD::SRA) {
28806 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
28807 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
28808
28809 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
28810 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
28811 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
28812 return Res;
28813 }
28814    llvm_unreachable("Unknown shift opcode.");
28815 }
28816
28817 return SDValue();
28818}
28819
28820static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
28821 const X86Subtarget &Subtarget) {
28822 MVT VT = Op.getSimpleValueType();
28823 SDLoc dl(Op);
28824 SDValue R = Op.getOperand(0);
28825 SDValue Amt = Op.getOperand(1);
28826 unsigned Opcode = Op.getOpcode();
28827 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
28828 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
28829
28830 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
28831 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
28832 MVT EltVT = VT.getVectorElementType();
28833      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
28834 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
28835 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
28836 else if (EltVT.bitsLT(MVT::i32))
28837 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28838
28839 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
28840 }
28841
28842 // vXi8 shifts - shift as v8i16 + mask result.
28843 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
28844 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
28845 VT == MVT::v64i8) &&
28846 !Subtarget.hasXOP()) {
28847 unsigned NumElts = VT.getVectorNumElements();
28848 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28849 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
28850 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
28851 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
28852 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
28853
28854 // Create the mask using vXi16 shifts. For shift-rights we need to move
28855 // the upper byte down before splatting the vXi8 mask.
28856 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
28857 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
28858 BaseShAmt, Subtarget, DAG);
28859 if (Opcode != ISD::SHL)
28860 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
28861 8, DAG);
28862 BitMask = DAG.getBitcast(VT, BitMask);
28863 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
28864 SmallVector<int, 64>(NumElts, 0));
28865
28866 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
28867 DAG.getBitcast(ExtVT, R), BaseShAmt,
28868 Subtarget, DAG);
28869 Res = DAG.getBitcast(VT, Res);
28870 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
28871
28872 if (Opcode == ISD::SRA) {
28873 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
28874 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
28875 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
28876 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
28877 BaseShAmt, Subtarget, DAG);
28878 SignMask = DAG.getBitcast(VT, SignMask);
28879 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
28880 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
28881 }
28882 return Res;
28883 }
28884 }
28885 }
28886
28887 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
28888 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
28889 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
28890 Amt = Amt.getOperand(0);
28891 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
28892 std::vector<SDValue> Vals(Ratio);
28893 for (unsigned i = 0; i != Ratio; ++i)
28894 Vals[i] = Amt.getOperand(i);
28895 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
28896 for (unsigned j = 0; j != Ratio; ++j)
28897 if (Vals[j] != Amt.getOperand(i + j))
28898 return SDValue();
28899 }
28900
28901 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
28902 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
28903 }
28904 return SDValue();
28905}
28906
28907// Convert a shift/rotate left amount to a multiplication scale factor.
28908static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
28909 const X86Subtarget &Subtarget,
28910 SelectionDAG &DAG) {
28911 MVT VT = Amt.getSimpleValueType();
28912 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
28913 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
28914 (Subtarget.hasVBMI2() && VT == MVT::v32i16) ||
28915 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
28916 return SDValue();
28917
28918 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
28919 SmallVector<SDValue, 8> Elts;
28920 MVT SVT = VT.getVectorElementType();
28921 unsigned SVTBits = SVT.getSizeInBits();
28922 APInt One(SVTBits, 1);
28923 unsigned NumElems = VT.getVectorNumElements();
28924
28925 for (unsigned i = 0; i != NumElems; ++i) {
28926 SDValue Op = Amt->getOperand(i);
28927 if (Op->isUndef()) {
28928 Elts.push_back(Op);
28929 continue;
28930 }
28931
28932 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
28933 APInt C(SVTBits, ND->getZExtValue());
28934 uint64_t ShAmt = C.getZExtValue();
28935 if (ShAmt >= SVTBits) {
28936 Elts.push_back(DAG.getUNDEF(SVT));
28937 continue;
28938 }
28939 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
28940 }
28941 return DAG.getBuildVector(VT, dl, Elts);
28942 }
28943
28944 // If the target doesn't support variable shifts, use either FP conversion
28945 // or integer multiplication to avoid shifting each element individually.
28946 if (VT == MVT::v4i32) {
28947 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
28948 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
28949 DAG.getConstant(0x3f800000U, dl, VT));
28950 Amt = DAG.getBitcast(MVT::v4f32, Amt);
28951 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
28952 }
28953
28954 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
28955 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
28956 SDValue Z = DAG.getConstant(0, dl, VT);
28957 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
28958 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
28959 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
28960 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
28961 if (Subtarget.hasSSE41())
28962 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
28963
28964 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
28965 DAG.getBitcast(VT, Hi),
28966 {0, 2, 4, 6, 8, 10, 12, 14});
28967 }
28968
28969 return SDValue();
28970}
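
Editorial aside (not part of the analyzed source): a scalar sketch of the v4i32 branch above, assuming IEEE-754 single precision: shifting the amount into the exponent field and adding 0x3f800000 (the bit pattern of 1.0f) constructs the float 2^Amt, so FP_TO_SINT recovers 1 << Amt.

#include <cstdint>
#include <cstring>

static uint32_t shiftAmountToScale(uint32_t Amt) { // valid for Amt <= 30
  uint32_t Bits = (Amt << 23) + 0x3f800000u; // add Amt to the exponent of 1.0f
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return (uint32_t)(int32_t)F;               // FP_TO_SINT(2^Amt) == 1u << Amt
}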
28971
28972static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
28973 SelectionDAG &DAG) {
28974 MVT VT = Op.getSimpleValueType();
28975 SDLoc dl(Op);
28976 SDValue R = Op.getOperand(0);
28977 SDValue Amt = Op.getOperand(1);
28978 unsigned EltSizeInBits = VT.getScalarSizeInBits();
28979 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
28980
28981 unsigned Opc = Op.getOpcode();
28982 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
28983 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
28984
28985  assert(VT.isVector() && "Custom lowering only for vector shifts!");
28986  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
28987
28988 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
28989 return V;
28990
28991 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
28992 return V;
28993
28994 if (SupportedVectorVarShift(VT, Subtarget, Opc))
28995 return Op;
28996
28997 // XOP has 128-bit variable logical/arithmetic shifts.
28998 // +ve/-ve Amt = shift left/right.
28999 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29000 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29001 if (Opc == ISD::SRL || Opc == ISD::SRA) {
29002 SDValue Zero = DAG.getConstant(0, dl, VT);
29003 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
29004 }
29005 if (Opc == ISD::SHL || Opc == ISD::SRL)
29006 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29007 if (Opc == ISD::SRA)
29008 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29009 }
29010
29011  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29012 // shifts per-lane and then shuffle the partial results back together.
29013 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29014 // Splat the shift amounts so the scalar shifts above will catch it.
29015 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29016 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29017 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29018 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29019 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29020 }
29021
29022 // i64 vector arithmetic shift can be emulated with the transform:
29023 // M = lshr(SIGN_MASK, Amt)
29024 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
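  // E.g. scaled down to 8 bits with Amt = 1: R = 0x80 gives lshr(R, 1) = 0x40
  // and M = 0x40, so xor = 0x00 and sub = 0xC0 (i.e. -64), matching
  // ashr(-128, 1); for non-negative R the sign position is clear and the
  // xor/sub pair cancels, leaving the plain logical shift.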
29025 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29026 Opc == ISD::SRA) {
29027 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29028 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29029 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29030 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29031 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29032 return R;
29033 }
29034
29035 // If possible, lower this shift as a sequence of two shifts by
29036 // constant plus a BLENDing shuffle instead of scalarizing it.
29037 // Example:
29038 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29039 //
29040 // Could be rewritten as:
29041 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29042 //
29043 // The advantage is that the two shifts from the example would be
29044 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29045 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29046 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29047 SDValue Amt1, Amt2;
29048 unsigned NumElts = VT.getVectorNumElements();
29049 SmallVector<int, 8> ShuffleMask;
29050 for (unsigned i = 0; i != NumElts; ++i) {
29051 SDValue A = Amt->getOperand(i);
29052 if (A.isUndef()) {
29053 ShuffleMask.push_back(SM_SentinelUndef);
29054 continue;
29055 }
29056 if (!Amt1 || Amt1 == A) {
29057 ShuffleMask.push_back(i);
29058 Amt1 = A;
29059 continue;
29060 }
29061 if (!Amt2 || Amt2 == A) {
29062 ShuffleMask.push_back(i + NumElts);
29063 Amt2 = A;
29064 continue;
29065 }
29066 break;
29067 }
29068
29069 // Only perform this blend if we can perform it without loading a mask.
29070 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29071 (VT != MVT::v16i16 ||
29072 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29073 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29074 canWidenShuffleElements(ShuffleMask))) {
29075 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29076 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29077 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29078 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29079 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29080 Cst1->getZExtValue(), DAG);
29081 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29082 Cst2->getZExtValue(), DAG);
29083 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29084 }
29085 }
29086 }
29087
29088 // If possible, lower this packed shift into a vector multiply instead of
29089 // expanding it into a sequence of scalar shifts.
29090 if (Opc == ISD::SHL)
29091 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29092 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29093
29094 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29095 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
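  // E.g. for an i16 lane with Amt = 3: Scale = 1 << 13 and
  // MULHU(x, 1 << 13) == (x * 8192) >> 16 == x >> 3. Amt == 0 would wrap the
  // scale to 0, hence the SETEQ/select below that keeps the original value.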
29096 if (Opc == ISD::SRL && ConstantAmt &&
29097 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29098 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29099 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29100 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29101 SDValue Zero = DAG.getConstant(0, dl, VT);
29102 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29103 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29104 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29105 }
29106 }
29107
29108 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29109 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29110 // TODO: Special case handling for shift by 0/1, really we can afford either
29111 // of these cases in pre-SSE41/XOP/AVX512 but not both.
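  // E.g. for an i16 lane with Amt = 2: Scale = 1 << 14 and
  // MULHS(x, 1 << 14) == (x * 16384) >> 16 == x >> 2 with the sign preserved.
  // Amt == 0 (scale wraps to 0) and Amt == 1 (scale lands on the sign bit) are
  // the cases patched up by the selects below.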
29112 if (Opc == ISD::SRA && ConstantAmt &&
29113 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29114 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29115 !Subtarget.hasAVX512()) ||
29116 DAG.isKnownNeverZero(Amt))) {
29117 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29118 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29119 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29120 SDValue Amt0 =
29121 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29122 SDValue Amt1 =
29123 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29124 SDValue Sra1 =
29125 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29126 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29127 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29128 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29129 }
29130 }
29131
29132 // v4i32 Non Uniform Shifts.
29133 // If the shift amount is constant we can shift each lane using the SSE2
29134 // immediate shifts, else we need to zero-extend each lane to the lower i64
29135 // and shift using the SSE2 variable shifts.
29136 // The separate results can then be blended together.
29137 if (VT == MVT::v4i32) {
29138 SDValue Amt0, Amt1, Amt2, Amt3;
29139 if (ConstantAmt) {
29140 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29141 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29142 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29143 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29144 } else {
29145 // The SSE2 shifts use the lower i64 as the same shift amount for
29146 // all lanes and the upper i64 is ignored. On AVX we're better off
29147 // just zero-extending, but for SSE just duplicating the top 16-bits is
29148 // cheaper and has the same effect for out of range values.
29149 if (Subtarget.hasAVX()) {
29150 SDValue Z = DAG.getConstant(0, dl, VT);
29151 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29152 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29153 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29154 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29155 } else {
29156 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29157 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29158 {4, 5, 6, 7, -1, -1, -1, -1});
29159 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29160 {0, 1, 1, 1, -1, -1, -1, -1});
29161 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29162 {2, 3, 3, 3, -1, -1, -1, -1});
29163 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29164 {0, 1, 1, 1, -1, -1, -1, -1});
29165 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
29166 {2, 3, 3, 3, -1, -1, -1, -1});
29167 }
29168 }
29169
29170 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29171 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29172 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29173 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29174 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29175
29176 // Merge the shifted lane results optimally with/without PBLENDW.
29177 // TODO - ideally shuffle combining would handle this.
29178 if (Subtarget.hasSSE41()) {
29179 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29180 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29181 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29182 }
29183 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29184 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29185 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29186 }
29187
29188 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29189 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29190 // make the existing SSE solution better.
29191  // NOTE: We honor the preferred vector width before promoting to 512-bits.
29192 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29193 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29194 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29195 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29196 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29197    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29198           "Unexpected vector type");
29199 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29200 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29201 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29202 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29203 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29204 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29205 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29206 }
29207
29208 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29209 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
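  // For intuition: once each i8 lane is widened into an i16 lane,
  // srl(x, a) == (zext(x) * (1 << (8 - a))) >> 8, and sra is the same with a
  // sign-extended x, so the multiply performs the shift and the trailing
  // VSRLI/PACKUS restore the i8 result.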
29210 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29211 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29212 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29213 !Subtarget.hasXOP()) {
29214 int NumElts = VT.getVectorNumElements();
29215 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29216
29217 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29218 // isn't legal).
29219 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29220 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29221 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29222 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29223    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29224           "Constant build vector expected");
29225
29226 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29227 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
29228 : DAG.getZExtOrTrunc(R, dl, ExVT);
29229 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29230 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29231 return DAG.getZExtOrTrunc(R, dl, VT);
29232 }
29233
29234 SmallVector<SDValue, 16> LoAmt, HiAmt;
29235 for (int i = 0; i != NumElts; i += 16) {
29236 for (int j = 0; j != 8; ++j) {
29237 LoAmt.push_back(Amt.getOperand(i + j));
29238 HiAmt.push_back(Amt.getOperand(i + j + 8));
29239 }
29240 }
29241
29242 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29243 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29244 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29245
29246 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29247 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29248 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29249 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29250 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29251 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29252 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29253 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29254 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29255 }
29256
29257 if (VT == MVT::v16i8 ||
29258 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29259 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29260 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29261
29262 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29263 if (VT.is512BitVector()) {
29264 // On AVX512BW targets we make use of the fact that VSELECT lowers
29265 // to a masked blend which selects bytes based just on the sign bit
29266 // extracted to a mask.
29267 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29268 V0 = DAG.getBitcast(VT, V0);
29269 V1 = DAG.getBitcast(VT, V1);
29270 Sel = DAG.getBitcast(VT, Sel);
29271 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29272 ISD::SETGT);
29273 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29274 } else if (Subtarget.hasSSE41()) {
29275 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29276 // on the sign bit.
29277 V0 = DAG.getBitcast(VT, V0);
29278 V1 = DAG.getBitcast(VT, V1);
29279 Sel = DAG.getBitcast(VT, Sel);
29280 return DAG.getBitcast(SelVT,
29281 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29282 }
29283 // On pre-SSE41 targets we test for the sign bit by comparing to
29284 // zero - a negative value will set all bits of the lanes to true
29285 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29286 SDValue Z = DAG.getConstant(0, dl, SelVT);
29287 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29288 return DAG.getSelect(dl, SelVT, C, V0, V1);
29289 };
29290
29291 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29292 // We can safely do this using i16 shifts as we're only interested in
29293 // the 3 lower bits of each byte.
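    // The three amount bits are then consumed MSB-first: each stage blends in
    // the value shifted by 4, then 2, then 1 whenever the current top bit of
    // the mask is set, and the intervening Amt += Amt exposes the next bit.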
29294 Amt = DAG.getBitcast(ExtVT, Amt);
29295 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29296 Amt = DAG.getBitcast(VT, Amt);
29297
29298 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29299 // r = VSELECT(r, shift(r, 4), a);
29300 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29301 R = SignBitSelect(VT, Amt, M, R);
29302
29303 // a += a
29304 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29305
29306 // r = VSELECT(r, shift(r, 2), a);
29307 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29308 R = SignBitSelect(VT, Amt, M, R);
29309
29310 // a += a
29311 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29312
29313 // return VSELECT(r, shift(r, 1), a);
29314 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29315 R = SignBitSelect(VT, Amt, M, R);
29316 return R;
29317 }
29318
29319 if (Opc == ISD::SRA) {
29320      // For SRA we need to unpack each byte to the higher byte of an i16 vector
29321 // so we can correctly sign extend. We don't care what happens to the
29322 // lower byte.
29323 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29324 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29325 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29326 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29327 ALo = DAG.getBitcast(ExtVT, ALo);
29328 AHi = DAG.getBitcast(ExtVT, AHi);
29329 RLo = DAG.getBitcast(ExtVT, RLo);
29330 RHi = DAG.getBitcast(ExtVT, RHi);
29331
29332 // r = VSELECT(r, shift(r, 4), a);
29333 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29334 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29335 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29336 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29337
29338 // a += a
29339 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29340 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29341
29342 // r = VSELECT(r, shift(r, 2), a);
29343 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29344 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29345 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29346 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29347
29348 // a += a
29349 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29350 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29351
29352 // r = VSELECT(r, shift(r, 1), a);
29353 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29354 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29355 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29356 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29357
29358 // Logical shift the result back to the lower byte, leaving a zero upper
29359 // byte meaning that we can safely pack with PACKUSWB.
29360 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29361 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29362 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29363 }
29364 }
29365
29366 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29367 MVT ExtVT = MVT::v8i32;
29368 SDValue Z = DAG.getConstant(0, dl, VT);
29369 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29370 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29371 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29372 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29373 ALo = DAG.getBitcast(ExtVT, ALo);
29374 AHi = DAG.getBitcast(ExtVT, AHi);
29375 RLo = DAG.getBitcast(ExtVT, RLo);
29376 RHi = DAG.getBitcast(ExtVT, RHi);
29377 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29378 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29379 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29380 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29381 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29382 }
29383
29384 if (VT == MVT::v8i16) {
29385 // If we have a constant shift amount, the non-SSE41 path is best as
29386    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29387 bool UseSSE41 = Subtarget.hasSSE41() &&
29388 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29389
29390 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29391 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29392 // the sign bit.
29393 if (UseSSE41) {
29394 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29395 V0 = DAG.getBitcast(ExtVT, V0);
29396 V1 = DAG.getBitcast(ExtVT, V1);
29397 Sel = DAG.getBitcast(ExtVT, Sel);
29398 return DAG.getBitcast(
29399 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29400 }
29401 // On pre-SSE41 targets we splat the sign bit - a negative value will
29402 // set all bits of the lanes to true and VSELECT uses that in
29403 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29404 SDValue C =
29405 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29406 return DAG.getSelect(dl, VT, C, V0, V1);
29407 };
29408
29409 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29410 if (UseSSE41) {
29411 // On SSE41 targets we need to replicate the shift mask in both
29412 // bytes for PBLENDVB.
29413 Amt = DAG.getNode(
29414 ISD::OR, dl, VT,
29415 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29416 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29417 } else {
29418 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29419 }
29420
29421 // r = VSELECT(r, shift(r, 8), a);
29422 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29423 R = SignBitSelect(Amt, M, R);
29424
29425 // a += a
29426 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29427
29428 // r = VSELECT(r, shift(r, 4), a);
29429 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29430 R = SignBitSelect(Amt, M, R);
29431
29432 // a += a
29433 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29434
29435 // r = VSELECT(r, shift(r, 2), a);
29436 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29437 R = SignBitSelect(Amt, M, R);
29438
29439 // a += a
29440 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29441
29442 // return VSELECT(r, shift(r, 1), a);
29443 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29444 R = SignBitSelect(Amt, M, R);
29445 return R;
29446 }
29447
29448 // Decompose 256-bit shifts into 128-bit shifts.
29449 if (VT.is256BitVector())
29450 return splitVectorIntBinary(Op, DAG);
29451
29452 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29453 return splitVectorIntBinary(Op, DAG);
29454
29455 return SDValue();
29456}
29457
29458static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
29459 SelectionDAG &DAG) {
29460 MVT VT = Op.getSimpleValueType();
29461  assert(VT.isVector() && "Custom lowering only for vector rotates!");
29462
29463 SDLoc DL(Op);
29464 SDValue R = Op.getOperand(0);
29465 SDValue Amt = Op.getOperand(1);
29466 unsigned Opcode = Op.getOpcode();
29467 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29468 int NumElts = VT.getVectorNumElements();
29469
29470 // Check for constant splat rotation amount.
29471 APInt CstSplatValue;
29472 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
29473
29474 // Check for splat rotate by zero.
29475 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
29476 return R;
29477
29478 // AVX512 implicitly uses modulo rotation amounts.
29479 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
29480 // Attempt to rotate by immediate.
29481 if (IsCstSplat) {
29482 unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
29483 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29484 return DAG.getNode(RotOpc, DL, VT, R,
29485 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29486 }
29487
29488 // Else, fall-back on VPROLV/VPRORV.
29489 return Op;
29490 }
29491
29492 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
29493 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
29494 unsigned FunnelOpc = (Opcode == ISD::ROTL ? ISD::FSHL : ISD::FSHR);
29495 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
29496 }
29497
29498  assert((Opcode == ISD::ROTL) && "Only ROTL supported");
29499
29500 // XOP has 128-bit vector variable + immediate rotates.
29501 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
29502 // XOP implicitly uses modulo rotation amounts.
29503 if (Subtarget.hasXOP()) {
29504 if (VT.is256BitVector())
29505 return splitVectorIntBinary(Op, DAG);
29506    assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
29507
29508 // Attempt to rotate by immediate.
29509 if (IsCstSplat) {
29510 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
29511 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
29512 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
29513 }
29514
29515 // Use general rotate by variable (per-element).
29516 return Op;
29517 }
29518
29519 // Split 256-bit integers on pre-AVX2 targets.
29520 if (VT.is256BitVector() && !Subtarget.hasAVX2())
29521 return splitVectorIntBinary(Op, DAG);
29522
29523  assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
29524          ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 ||
29525            VT == MVT::v32i16) &&
29526           Subtarget.hasAVX2())) &&
29527         "Only vXi32/vXi16/vXi8 vector rotates supported");
29528
29529  // Rotate by a uniform constant - expand back to shifts.
29530 if (IsCstSplat)
29531 return SDValue();
29532
29533 bool IsSplatAmt = DAG.isSplatValue(Amt);
29534
29535 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
29536 // the amount bit.
29537 if (EltSizeInBits == 8 && !IsSplatAmt) {
29538 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
29539 return SDValue();
29540
29541 // We don't need ModuloAmt here as we just peek at individual bits.
29542 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29543
29544 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29545 if (Subtarget.hasSSE41()) {
29546 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29547 // on the sign bit.
29548 V0 = DAG.getBitcast(VT, V0);
29549 V1 = DAG.getBitcast(VT, V1);
29550 Sel = DAG.getBitcast(VT, Sel);
29551 return DAG.getBitcast(SelVT,
29552 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
29553 }
29554 // On pre-SSE41 targets we test for the sign bit by comparing to
29555 // zero - a negative value will set all bits of the lanes to true
29556 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29557 SDValue Z = DAG.getConstant(0, DL, SelVT);
29558 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
29559 return DAG.getSelect(DL, SelVT, C, V0, V1);
29560 };
29561
29562 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29563 // We can safely do this using i16 shifts as we're only interested in
29564 // the 3 lower bits of each byte.
29565 Amt = DAG.getBitcast(ExtVT, Amt);
29566 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
29567 Amt = DAG.getBitcast(VT, Amt);
29568
29569 // r = VSELECT(r, rot(r, 4), a);
29570 SDValue M;
29571 M = DAG.getNode(
29572 ISD::OR, DL, VT,
29573 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
29574 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
29575 R = SignBitSelect(VT, Amt, M, R);
29576
29577 // a += a
29578 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29579
29580 // r = VSELECT(r, rot(r, 2), a);
29581 M = DAG.getNode(
29582 ISD::OR, DL, VT,
29583 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
29584 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
29585 R = SignBitSelect(VT, Amt, M, R);
29586
29587 // a += a
29588 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
29589
29590 // return VSELECT(r, rot(r, 1), a);
29591 M = DAG.getNode(
29592 ISD::OR, DL, VT,
29593 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
29594 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
29595 return SignBitSelect(VT, Amt, M, R);
29596 }
29597
29598 // ISD::ROT* uses modulo rotate amounts.
29599 if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) {
29600 // If the amount is a splat, perform the modulo BEFORE the splat,
29601    // as this helps LowerScalarVariableShift to remove the splat later.
29602 Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt);
29603 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29604 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29605 Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT),
29606 SmallVector<int>(NumElts, 0));
29607 } else {
29608 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
29609 DAG.getConstant(EltSizeInBits - 1, DL, VT));
29610 }
29611
29612 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29613 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
29614 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
29615
29616 // Fallback for splats + all supported variable shifts.
29617  // Also fall back for non-constant AVX2 vXi16.
29618 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
29619 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
29620 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
29621 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
29622 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
29623 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
29624 }
29625
29626 // As with shifts, convert the rotation amount to a multiplication factor.
29627 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
29628  assert(Scale && "Failed to convert ROTL amount to scale");
29629
29630 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
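  // For intuition: with Scale = 1 << c per lane, MUL yields the low half
  // (x << c) and MULHU yields the wrapped-around high half (x >> (16 - c)),
  // so ORing them is exactly rotl(x, c); c == 0 gives MULHU == 0 and MUL == x.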
29631 if (EltSizeInBits == 16) {
29632 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
29633 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
29634 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
29635 }
29636
29637 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
29638 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
29639 // that can then be OR'd with the lower 32-bits.
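  // For intuition: each 64-bit product x * (1 << c) keeps (x << c) in its low
  // 32 bits and the wrapped-out bits (x >> (32 - c)) in its high 32 bits, so
  // interleaving and ORing the two halves below rebuilds rotl(x, c) per lane.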
29640  assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
29641 static const int OddMask[] = {1, -1, 3, -1};
29642 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
29643 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
29644
29645 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29646 DAG.getBitcast(MVT::v2i64, R),
29647 DAG.getBitcast(MVT::v2i64, Scale));
29648 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
29649 DAG.getBitcast(MVT::v2i64, R13),
29650 DAG.getBitcast(MVT::v2i64, Scale13));
29651 Res02 = DAG.getBitcast(VT, Res02);
29652 Res13 = DAG.getBitcast(VT, Res13);
29653
29654 return DAG.getNode(ISD::OR, DL, VT,
29655 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
29656 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
29657}
29658
29659/// Returns true if the operand type is exactly twice the native width, and
29660/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
29661/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
29662/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
29663bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
29664 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
29665
29666 if (OpWidth == 64)
29667 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
29668 if (OpWidth == 128)
29669 return Subtarget.hasCmpxchg16b();
29670
29671 return false;
29672}
29673
29674bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
29675 Type *MemType = SI->getValueOperand()->getType();
29676
29677 bool NoImplicitFloatOps =
29678 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29679 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29680 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29681 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29682 return false;
29683
29684 return needsCmpXchgNb(MemType);
29685}
29686
29687// Note: this turns large loads into lock cmpxchg8b/16b.
29688// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
29689TargetLowering::AtomicExpansionKind
29690X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
29691 Type *MemType = LI->getType();
29692
29693  // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
29694 // can use movq to do the load. If we have X87 we can load into an 80-bit
29695 // X87 register and store it to a stack temporary.
29696 bool NoImplicitFloatOps =
29697 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
29698 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
29699 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
29700 (Subtarget.hasSSE1() || Subtarget.hasX87()))
29701 return AtomicExpansionKind::None;
29702
29703 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29704 : AtomicExpansionKind::None;
29705}
29706
29707TargetLowering::AtomicExpansionKind
29708X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
29709 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29710 Type *MemType = AI->getType();
29711
29712 // If the operand is too big, we must see if cmpxchg8/16b is available
29713 // and default to library calls otherwise.
29714 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
29715 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
29716 : AtomicExpansionKind::None;
29717 }
29718
29719 AtomicRMWInst::BinOp Op = AI->getOperation();
29720 switch (Op) {
29721 default:
29722    llvm_unreachable("Unknown atomic operation");
29723 case AtomicRMWInst::Xchg:
29724 case AtomicRMWInst::Add:
29725 case AtomicRMWInst::Sub:
29726 // It's better to use xadd, xsub or xchg for these in all cases.
29727 return AtomicExpansionKind::None;
29728 case AtomicRMWInst::Or:
29729 case AtomicRMWInst::And:
29730 case AtomicRMWInst::Xor:
29731 // If the atomicrmw's result isn't actually used, we can just add a "lock"
29732 // prefix to a normal instruction for these operations.
29733 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
29734 : AtomicExpansionKind::None;
29735 case AtomicRMWInst::Nand:
29736 case AtomicRMWInst::Max:
29737 case AtomicRMWInst::Min:
29738 case AtomicRMWInst::UMax:
29739 case AtomicRMWInst::UMin:
29740 case AtomicRMWInst::FAdd:
29741 case AtomicRMWInst::FSub:
29742 // These always require a non-trivial set of data operations on x86. We must
29743 // use a cmpxchg loop.
29744 return AtomicExpansionKind::CmpXChg;
29745 }
29746}
29747
29748LoadInst *
29749X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
29750 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
29751 Type *MemType = AI->getType();
29752 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
29753 // there is no benefit in turning such RMWs into loads, and it is actually
29754  // harmful as it introduces an mfence.
29755 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
29756 return nullptr;
29757
29758 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
29759 // lowering available in lowerAtomicArith.
29760 // TODO: push more cases through this path.
29761 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
29762 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
29763 AI->use_empty())
29764 return nullptr;
29765
29766 IRBuilder<> Builder(AI);
29767 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
29768 auto SSID = AI->getSyncScopeID();
29769 // We must restrict the ordering to avoid generating loads with Release or
29770 // ReleaseAcquire orderings.
29771 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
29772
29773 // Before the load we need a fence. Here is an example lifted from
29774 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
29775 // is required:
29776 // Thread 0:
29777 // x.store(1, relaxed);
29778 // r1 = y.fetch_add(0, release);
29779 // Thread 1:
29780 // y.fetch_add(42, acquire);
29781 // r2 = x.load(relaxed);
29782 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
29783 // lowered to just a load without a fence. A mfence flushes the store buffer,
29784 // making the optimization clearly correct.
29785 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
29786 // otherwise, we might be able to be more aggressive on relaxed idempotent
29787 // rmw. In practice, they do not look useful, so we don't try to be
29788 // especially clever.
29789 if (SSID == SyncScope::SingleThread)
29790 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
29791 // the IR level, so we must wrap it in an intrinsic.
29792 return nullptr;
29793
29794 if (!Subtarget.hasMFence())
29795 // FIXME: it might make sense to use a locked operation here but on a
29796 // different cache-line to prevent cache-line bouncing. In practice it
29797 // is probably a small win, and x86 processors without mfence are rare
29798 // enough that we do not bother.
29799 return nullptr;
29800
29801 Function *MFence =
29802 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
29803 Builder.CreateCall(MFence, {});
29804
29805 // Finally we can emit the atomic load.
29806 LoadInst *Loaded = Builder.CreateAlignedLoad(
29807 AI->getType(), AI->getPointerOperand(), AI->getAlign());
29808 Loaded->setAtomic(Order, SSID);
29809 AI->replaceAllUsesWith(Loaded);
29810 AI->eraseFromParent();
29811 return Loaded;
29812}
29813
29814bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
29815 if (!SI.isUnordered())
29816 return false;
29817 return ExperimentalUnorderedISEL;
29818}
29819bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
29820 if (!LI.isUnordered())
29821 return false;
29822 return ExperimentalUnorderedISEL;
29823}
29824
29825
29826/// Emit a locked operation on a stack location which does not change any
29827/// memory location, but does involve a lock prefix. Location is chosen to be
29828/// a) very likely accessed only by a single thread to minimize cache traffic,
29829/// and b) definitely dereferenceable. Returns the new Chain result.
29830static SDValue emitLockedStackOp(SelectionDAG &DAG,
29831 const X86Subtarget &Subtarget, SDValue Chain,
29832 const SDLoc &DL) {
29833 // Implementation notes:
29834 // 1) LOCK prefix creates a full read/write reordering barrier for memory
29835 // operations issued by the current processor. As such, the location
29836 // referenced is not relevant for the ordering properties of the instruction.
29837  // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
29838 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
29839 // 2) Using an immediate operand appears to be the best encoding choice
29840 // here since it doesn't require an extra register.
29841 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
29842 // is small enough it might just be measurement noise.)
29843 // 4) When choosing offsets, there are several contributing factors:
29844 // a) If there's no redzone, we default to TOS. (We could allocate a cache
29845 // line aligned stack object to improve this case.)
29846 // b) To minimize our chances of introducing a false dependence, we prefer
29847 // to offset the stack usage from TOS slightly.
29848 // c) To minimize concerns about cross thread stack usage - in particular,
29849 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
29850 // captures state in the TOS frame and accesses it from many threads -
29851 // we want to use an offset such that the offset is in a distinct cache
29852 // line from the TOS frame.
29853 //
29854 // For a general discussion of the tradeoffs and benchmark results, see:
29855 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
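  // The node built below is effectively the idiomatic MFENCE replacement
  // lock orl $0x0, <disp>(%rsp) (or %esp in 32-bit mode), with <disp> being
  // -64 when a 128-byte red zone is available and 0 otherwise.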
29856
29857 auto &MF = DAG.getMachineFunction();
29858 auto &TFL = *Subtarget.getFrameLowering();
29859 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
29860
29861 if (Subtarget.is64Bit()) {
29862 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29863 SDValue Ops[] = {
29864 DAG.getRegister(X86::RSP, MVT::i64), // Base
29865 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29866 DAG.getRegister(0, MVT::i64), // Index
29867 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29868 DAG.getRegister(0, MVT::i16), // Segment.
29869 Zero,
29870 Chain};
29871 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29872 MVT::Other, Ops);
29873 return SDValue(Res, 1);
29874 }
29875
29876 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
29877 SDValue Ops[] = {
29878 DAG.getRegister(X86::ESP, MVT::i32), // Base
29879 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
29880 DAG.getRegister(0, MVT::i32), // Index
29881 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
29882 DAG.getRegister(0, MVT::i16), // Segment.
29883 Zero,
29884 Chain
29885 };
29886 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
29887 MVT::Other, Ops);
29888 return SDValue(Res, 1);
29889}
29890
29891static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
29892 SelectionDAG &DAG) {
29893 SDLoc dl(Op);
29894 AtomicOrdering FenceOrdering =
29895 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
29896 SyncScope::ID FenceSSID =
29897 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
29898
29899 // The only fence that needs an instruction is a sequentially-consistent
29900 // cross-thread fence.
29901 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
29902 FenceSSID == SyncScope::System) {
29903 if (Subtarget.hasMFence())
29904 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
29905
29906 SDValue Chain = Op.getOperand(0);
29907 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
29908 }
29909
29910 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
29911 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
29912}
29913
29914static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
29915 SelectionDAG &DAG) {
29916 MVT T = Op.getSimpleValueType();
29917 SDLoc DL(Op);
29918 unsigned Reg = 0;
29919 unsigned size = 0;
29920 switch(T.SimpleTy) {
29921  default: llvm_unreachable("Invalid value type!");
29922 case MVT::i8: Reg = X86::AL; size = 1; break;
29923 case MVT::i16: Reg = X86::AX; size = 2; break;
29924 case MVT::i32: Reg = X86::EAX; size = 4; break;
29925 case MVT::i64:
29926    assert(Subtarget.is64Bit() && "Node not type legal!");
29927 Reg = X86::RAX; size = 8;
29928 break;
29929 }
29930 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
29931 Op.getOperand(2), SDValue());
29932 SDValue Ops[] = { cpIn.getValue(0),
29933 Op.getOperand(1),
29934 Op.getOperand(3),
29935 DAG.getTargetConstant(size, DL, MVT::i8),
29936 cpIn.getValue(1) };
29937 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29938 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
29939 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
29940 Ops, T, MMO);
29941
29942 SDValue cpOut =
29943 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
29944 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
29945 MVT::i32, cpOut.getValue(2));
29946 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
29947
29948 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29949 cpOut, Success, EFLAGS.getValue(1));
29950}
29951
29952// Create MOVMSKB, taking into account whether we need to split for AVX1.
29953static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
29954 const X86Subtarget &Subtarget) {
29955 MVT InVT = V.getSimpleValueType();
29956
29957 if (InVT == MVT::v64i8) {
29958 SDValue Lo, Hi;
29959 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29960 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
29961 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
29962 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
29963 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
29964 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
29965 DAG.getConstant(32, DL, MVT::i8));
29966 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
29967 }
29968 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
29969 SDValue Lo, Hi;
29970 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
29971 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
29972 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
29973 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
29974 DAG.getConstant(16, DL, MVT::i8));
29975 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
29976 }
29977
29978 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29979}
29980
29981static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
29982 SelectionDAG &DAG) {
29983 SDValue Src = Op.getOperand(0);
29984 MVT SrcVT = Src.getSimpleValueType();
29985 MVT DstVT = Op.getSimpleValueType();
29986
29987 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
29988 // half to v32i1 and concatenating the result.
29989 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
29990    assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29991    assert(Subtarget.hasBWI() && "Expected BWI target");
29992 SDLoc dl(Op);
29993 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29994 DAG.getIntPtrConstant(0, dl));
29995 Lo = DAG.getBitcast(MVT::v32i1, Lo);
29996 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
29997 DAG.getIntPtrConstant(1, dl));
29998 Hi = DAG.getBitcast(MVT::v32i1, Hi);
29999 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
30000 }
30001
30002 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
30003 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
30004    assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
30005 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
30006 SDLoc DL(Op);
30007 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
30008 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30009 return DAG.getZExtOrTrunc(V, DL, DstVT);
30010 }
30011
30012  assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
30013          SrcVT == MVT::i64) && "Unexpected VT!");
30014
30015  assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
30016 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
30017 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
30018 // This conversion needs to be expanded.
30019 return SDValue();
30020
30021 SDLoc dl(Op);
30022 if (SrcVT.isVector()) {
30023    // Widen the input vector in the case of MVT::v2i32.
30024 // Example: from MVT::v2i32 to MVT::v4i32.
30025 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
30026 SrcVT.getVectorNumElements() * 2);
30027 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
30028 DAG.getUNDEF(SrcVT));
30029 } else {
30030    assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
30031           "Unexpected source type in LowerBITCAST");
30032 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
30033 }
30034
30035 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
30036 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
30037
30038 if (DstVT == MVT::x86mmx)
30039 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
30040
30041 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
30042 DAG.getIntPtrConstant(0, dl));
30043}
30044
30045/// Compute the horizontal sum of bytes in V for the elements of VT.
30046///
30047/// Requires V to be a byte vector and VT to be an integer vector type with
30048/// wider elements than V's type. The width of the elements of VT determines
30049/// how many bytes of V are summed horizontally to produce each element of the
30050/// result.
30051static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
30052 const X86Subtarget &Subtarget,
30053 SelectionDAG &DAG) {
30054 SDLoc DL(V);
30055 MVT ByteVecVT = V.getSimpleValueType();
30056 MVT EltVT = VT.getVectorElementType();
30057  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
30058         "Expected value to have byte element type.");
30059 assert(EltVT != MVT::i8 &&
30060 "Horizontal byte sum only makes sense for wider elements!");
30061 unsigned VecSize = VT.getSizeInBits();
30062 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
30063
30064 // The PSADBW instruction horizontally adds all bytes and leaves the result in
30065 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
30066 if (EltVT == MVT::i64) {
30067 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
30068 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30069 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
30070 return DAG.getBitcast(VT, V);
30071 }
30072
30073 if (EltVT == MVT::i32) {
30074 // We unpack the low half and high half into i32s interleaved with zeros so
30075 // that we can use PSADBW to horizontally sum them. The most useful part of
30076 // this is that it lines up the results of two PSADBW instructions as
30077 // two v2i64 vectors which, when concatenated, hold the 4 population counts. We can
30078 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
30079 SDValue Zeros = DAG.getConstant(0, DL, VT);
30080 SDValue V32 = DAG.getBitcast(VT, V);
30081 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
30082 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
30083
30084 // Do the horizontal sums into two v2i64s.
30085 Zeros = DAG.getConstant(0, DL, ByteVecVT);
30086 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
30087 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30088 DAG.getBitcast(ByteVecVT, Low), Zeros);
30089 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
30090 DAG.getBitcast(ByteVecVT, High), Zeros);
30091
30092 // Merge them together.
30093 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
30094 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
30095 DAG.getBitcast(ShortVecVT, Low),
30096 DAG.getBitcast(ShortVecVT, High));
30097
30098 return DAG.getBitcast(VT, V);
30099 }
30100
30101 // The only element type left is i16.
30102 assert(EltVT == MVT::i16 && "Unknown how to handle type");
30103
30104 // To obtain the pop count for each i16 element starting from the pop counts
30105 // of its two i8 halves, shift the i16s left by 8, sum as i8s, and then shift
30106 // the i16s right by 8. It is important to shift as i16s because an i8 vector
30107 // shift isn't directly supported.
30108 SDValue ShifterV = DAG.getConstant(8, DL, VT);
30109 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30110 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
30111 DAG.getBitcast(ByteVecVT, V));
30112 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
30113}
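To make the i64 path above concrete: PSADBW against an all-zero operand sums each group of eight bytes into one 64-bit lane. The sketch below is only an illustrative scalar model of that instruction's effect (PsadbwVsZero is an invented name), not the SelectionDAG code:

// Illustrative scalar model: PSADBW against an all-zero vector sums each group
// of 8 bytes into one 64-bit lane, which is the horizontal byte sum the vXi64
// popcount path above relies on.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint64_t, 2> PsadbwVsZero(const std::array<uint8_t, 16> &V) {
  std::array<uint64_t, 2> Sums = {0, 0};
  for (int i = 0; i < 16; ++i)
    Sums[i / 8] += V[i]; // |V[i] - 0| == V[i]
  return Sums;
}

int main() {
  std::array<uint8_t, 16> Bytes{};
  for (int i = 0; i < 16; ++i)
    Bytes[i] = 1; // e.g. the per-byte popcounts of sixteen 0x01 bytes
  auto S = PsadbwVsZero(Bytes);
  std::printf("%llu %llu\n", (unsigned long long)S[0],
              (unsigned long long)S[1]); // prints 8 8
}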
30114
30115static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
30116 const X86Subtarget &Subtarget,
30117 SelectionDAG &DAG) {
30118 MVT VT = Op.getSimpleValueType();
30119 MVT EltVT = VT.getVectorElementType();
30120 int NumElts = VT.getVectorNumElements();
30121 (void)EltVT;
30122 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
30123
30124 // Implement a lookup table in register by using an algorithm based on:
30125 // http://wm.ite.pl/articles/sse-popcount.html
30126 //
30127 // The general idea is that each nibble of every byte in the input vector is an
30128 // index into an in-register pre-computed pop count table. We then split up the
30129 // input vector into two new ones: (1) a vector with only the shifted-right
30130 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
30131 // masked out higher ones) for each byte. PSHUFB is used separately with both
30132 // to index the in-register table. Next, both are added and the result is an
30133 // i8 vector where each element contains the pop count for its input byte (see the scalar sketch after this function).
30134 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
30135 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
30136 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
30137 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
30138
30139 SmallVector<SDValue, 64> LUTVec;
30140 for (int i = 0; i < NumElts; ++i)
30141 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
30142 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
30143 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
30144
30145 // High nibbles
30146 SDValue FourV = DAG.getConstant(4, DL, VT);
30147 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
30148
30149 // Low nibbles
30150 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
30151
30152 // The input vector is used as the shuffle mask that indexes elements into the
30153 // LUT. After counting low and high nibbles, add the vector to obtain the
30154 // final pop count per i8 element.
30155 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
30156 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
30157 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
30158}
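A scalar rendering of the in-register LUT algorithm above may help: each byte's pop count is the table entry for its low nibble plus the entry for its high nibble, and PSHUFB simply performs those 16-entry lookups for every byte in parallel. This is an illustration only; PopCountByte and PopLUT are names made up for the sketch.

#include <cstdint>
#include <cstdio>

// Same 16-entry table as the LUT above: popcount of each 4-bit value.
static const uint8_t PopLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                   1, 2, 2, 3, 2, 3, 3, 4};

static uint8_t PopCountByte(uint8_t B) {
  uint8_t Lo = B & 0x0F;        // low nibble indexes the table directly
  uint8_t Hi = (B >> 4) & 0x0F; // high nibble is shifted down first
  return PopLUT[Lo] + PopLUT[Hi];
}

int main() {
  std::printf("%u\n", PopCountByte(0xAB)); // 0xAB = 10101011 -> prints 5
}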
30159
30160// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
30161// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
30162static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30163 SelectionDAG &DAG) {
30164 MVT VT = Op.getSimpleValueType();
30165 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
30166 "Unknown CTPOP type to handle");
30167 SDLoc DL(Op.getNode());
30168 SDValue Op0 = Op.getOperand(0);
30169
30170 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
30171 if (Subtarget.hasVPOPCNTDQ()) {
30172 unsigned NumElems = VT.getVectorNumElements();
30173 assert((VT.getVectorElementType() == MVT::i8 ||
30174 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
30175 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
30176 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
30177 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
30178 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
30179 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
30180 }
30181 }
30182
30183 // Decompose 256-bit ops into smaller 128-bit ops.
30184 if (VT.is256BitVector() && !Subtarget.hasInt256())
30185 return splitVectorIntUnary(Op, DAG);
30186
30187 // Decompose 512-bit ops into smaller 256-bit ops.
30188 if (VT.is512BitVector() && !Subtarget.hasBWI())
30189 return splitVectorIntUnary(Op, DAG);
30190
30191 // For element types greater than i8, do vXi8 pop counts and a bytesum.
30192 if (VT.getScalarType() != MVT::i8) {
30193 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
30194 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
30195 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
30196 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
30197 }
30198
30199 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
30200 if (!Subtarget.hasSSSE3())
30201 return SDValue();
30202
30203 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
30204}
30205
30206static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
30207 SelectionDAG &DAG) {
30208 assert(Op.getSimpleValueType().isVector() &&
30209 "We only do custom lowering for vector population count.");
30210 return LowerVectorCTPOP(Op, Subtarget, DAG);
30211}
30212
30213static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
30214 MVT VT = Op.getSimpleValueType();
30215 SDValue In = Op.getOperand(0);
30216 SDLoc DL(Op);
30217
30218 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
30219 // perform the BITREVERSE.
30220 if (!VT.isVector()) {
30221 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
30222 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
30223 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
30224 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
30225 DAG.getIntPtrConstant(0, DL));
30226 }
30227
30228 int NumElts = VT.getVectorNumElements();
30229 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
30230
30231 // Decompose 256-bit ops into smaller 128-bit ops.
30232 if (VT.is256BitVector())
30233 return splitVectorIntUnary(Op, DAG);
30234
30235 assert(VT.is128BitVector() &&
30236 "Only 128-bit vector bitreverse lowering supported.");
30237
30238 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
30239 // perform the BSWAP in the shuffle.
30240 // It's best to shuffle using the second operand as this will implicitly allow
30241 // memory folding for multiple vectors.
30242 SmallVector<SDValue, 16> MaskElts;
30243 for (int i = 0; i != NumElts; ++i) {
30244 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
30245 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
30246 int PermuteByte = SourceByte | (2 << 5);
30247 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
30248 }
30249 }
30250
30251 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
30252 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
30253 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
30254 Res, Mask);
30255 return DAG.getBitcast(VT, Res);
30256}
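The single VPPERM above folds two steps into one shuffle: the byte selection mirrors the bytes within each element (the BSWAP) and the 2 << 5 op field bit-reverses each selected byte. A scalar model of why that composition reverses all the bits of an element is sketched below; ReverseByte and ReverseBits32 are illustrative names and the real lowering never materializes these loops.

#include <cstdint>
#include <cstdio>

static uint8_t ReverseByte(uint8_t B) {
  uint8_t R = 0;
  for (int Bit = 0; Bit < 8; ++Bit)
    R |= ((B >> Bit) & 1) << (7 - Bit); // mirror the 8 bits of one byte
  return R;
}

static uint32_t ReverseBits32(uint32_t V) {
  uint32_t R = 0;
  for (int Byte = 0; Byte < 4; ++Byte) {
    uint8_t Src = (V >> (8 * Byte)) & 0xFF;
    // Place the bit-reversed byte in the mirrored byte position (the BSWAP).
    R |= (uint32_t)ReverseByte(Src) << (8 * (3 - Byte));
  }
  return R;
}

int main() {
  std::printf("0x%08X\n", (unsigned)ReverseBits32(0x00000001)); // prints 0x80000000
}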
30257
30258static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
30259 SelectionDAG &DAG) {
30260 MVT VT = Op.getSimpleValueType();
30261
30262 if (Subtarget.hasXOP() && !VT.is512BitVector())
30263 return LowerBITREVERSE_XOP(Op, DAG);
30264
30265 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
30266
30267 SDValue In = Op.getOperand(0);
30268 SDLoc DL(Op);
30269
30270 assert(VT.getScalarType() == MVT::i8 &&
30271 "Only byte vector BITREVERSE supported");
30272
30273 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
30274 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
30275 return splitVectorIntUnary(Op, DAG);
30276
30277 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
30278 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
30279 return splitVectorIntUnary(Op, DAG);
30280
30281 unsigned NumElts = VT.getVectorNumElements();
30282
30283 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
30284 if (Subtarget.hasGFNI()) {
30285 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
30286 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
30287 Matrix = DAG.getBitcast(VT, Matrix);
30288 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
30289 DAG.getTargetConstant(0, DL, MVT::i8));
30290 }
30291
30292 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into its
30293 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
30294 // 0-15 value (moved to the other nibble).
30295 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
30296 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
30297 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
30298
30299 const int LoLUT[16] = {
30300 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
30301 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
30302 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
30303 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
30304 const int HiLUT[16] = {
30305 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
30306 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
30307 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
30308 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
30309
30310 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
30311 for (unsigned i = 0; i < NumElts; ++i) {
30312 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
30313 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
30314 }
30315
30316 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
30317 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
30318 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
30319 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
30320 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30321}
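The PSHUFB form above can be read as a per-byte table lookup. Using the same LoLUT/HiLUT tables, a scalar equivalent is sketched below (illustrative only; BitReverseByte is an invented name):

#include <cstdint>
#include <cstdio>

// LoLUT maps a byte's low nibble to its bit-reversed form in the high nibble;
// HiLUT maps the high nibble to its bit-reversed form in the low nibble.
static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                  0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                  0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};

static uint8_t BitReverseByte(uint8_t B) {
  return LoLUT[B & 0x0F] | HiLUT[(B >> 4) & 0x0F]; // OR of the two lookups
}

int main() {
  std::printf("0x%02X\n", (unsigned)BitReverseByte(0x01)); // prints 0x80
}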
30322
30323static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
30324 SelectionDAG &DAG) {
30325 SDLoc DL(Op);
30326 SDValue X = Op.getOperand(0);
30327 MVT VT = Op.getSimpleValueType();
30328
30329 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
30330 if (VT == MVT::i8 ||
30331 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
30332 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
30333 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
30334 DAG.getConstant(0, DL, MVT::i8));
30335 // Copy the inverse of the parity flag into a register with setcc.
30336 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
30337 // Extend to the original type.
30338 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
30339 }
30340
30341 if (VT == MVT::i64) {
30342 // Xor the high and low 32-bit halves together using a 32-bit operation.
30343 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
30344 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
30345 DAG.getConstant(32, DL, MVT::i8)));
30346 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
30347 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
30348 }
30349
30350 if (VT != MVT::i16) {
30351 // Xor the high and low 16-bits together using a 32-bit operation.
30352 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
30353 DAG.getConstant(16, DL, MVT::i8));
30354 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
30355 } else {
30356 // If the input is 16-bits, we need to extend to use an i32 shift below.
30357 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
30358 }
30359
30360 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
30361 // This should allow an h-reg to be used to save a shift.
30362 SDValue Hi = DAG.getNode(
30363 ISD::TRUNCATE, DL, MVT::i8,
30364 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
30365 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
30366 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
30367 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
30368
30369 // Copy the inverse of the parity flag into a register with setcc.
30370 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
30371 // Extend to the original type.
30372 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
30373}
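The lowering above computes parity by repeatedly folding the value in half with XOR until only eight bits carry the answer, then reading the inverted parity flag of the final 8-bit operation. A scalar sketch of the same folding follows; Parity64 is a hypothetical helper and the bit loop at the end stands in for the PF flag check.

#include <cstdint>
#include <cstdio>

static unsigned Parity64(uint64_t X) {
  uint32_t X32 = (uint32_t)(X >> 32) ^ (uint32_t)X; // fold 64 -> 32
  uint32_t X16 = (X32 >> 16) ^ X32;                 // fold 32 -> 16
  uint8_t X8 = (uint8_t)((X16 >> 8) ^ X16);         // fold 16 -> 8
  unsigned P = 0;
  for (int i = 0; i < 8; ++i)
    P ^= (X8 >> i) & 1; // 1 iff an odd number of bits were set in X
  return P;
}

int main() {
  std::printf("%u %u\n", Parity64(0x1), Parity64(0x3)); // prints 1 0
}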
30374
30375static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
30376 const X86Subtarget &Subtarget) {
30377 unsigned NewOpc = 0;
30378 switch (N->getOpcode()) {
30379 case ISD::ATOMIC_LOAD_ADD:
30380 NewOpc = X86ISD::LADD;
30381 break;
30382 case ISD::ATOMIC_LOAD_SUB:
30383 NewOpc = X86ISD::LSUB;
30384 break;
30385 case ISD::ATOMIC_LOAD_OR:
30386 NewOpc = X86ISD::LOR;
30387 break;
30388 case ISD::ATOMIC_LOAD_XOR:
30389 NewOpc = X86ISD::LXOR;
30390 break;
30391 case ISD::ATOMIC_LOAD_AND:
30392 NewOpc = X86ISD::LAND;
30393 break;
30394 default:
30395 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
30396 }
30397
30398 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30399
30400 return DAG.getMemIntrinsicNode(
30401 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
30402 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
30403 /*MemVT=*/N->getSimpleValueType(0), MMO);
30404}
30405
30406/// Lower atomic_load_ops into LOCK-prefixed operations.
30407static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
30408 const X86Subtarget &Subtarget) {
30409 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
30410 SDValue Chain = N->getOperand(0);
30411 SDValue LHS = N->getOperand(1);
30412 SDValue RHS = N->getOperand(2);
30413 unsigned Opc = N->getOpcode();
30414 MVT VT = N->getSimpleValueType(0);
30415 SDLoc DL(N);
30416
30417 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
30418 // can only be lowered when the result is unused. They should have already
30419 // been transformed into a cmpxchg loop in AtomicExpand.
30420 if (N->hasAnyUseOfValue(0)) {
30421 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
30422 // select LXADD if LOCK_SUB can't be selected.
30423 if (Opc == ISD::ATOMIC_LOAD_SUB) {
30424 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
30425 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
30426 RHS, AN->getMemOperand());
30427 }
30428 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
30429 "Used AtomicRMW ops other than Add should have been expanded!");
30430 return N;
30431 }
30432
30433 // Specialized lowering for the canonical form of an idempotent atomicrmw.
30434 // The core idea here is that since the memory location isn't actually
30435 // changing, all we need is a lowering for the *ordering* impacts of the
30436 // atomicrmw. As such, we can choose a different operation and memory
30437 // location to minimize impact on other code.
30438 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
30439 // On X86, the only ordering which actually requires an instruction is
30440 // seq_cst that isn't SingleThread; everything else just needs to be preserved
30441 // during codegen and then dropped. Note that we expect (but don't assume)
30442 // that orderings other than seq_cst and acq_rel have been canonicalized to
30443 // a store or load.
30444 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
30445 AN->getSyncScopeID() == SyncScope::System) {
30446 // Prefer a locked operation against a stack location to minimize cache
30447 // traffic. This assumes that stack locations are very likely to be
30448 // accessed only by the owning thread.
30449 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
30450 assert(!N->hasAnyUseOfValue(0));
30451 // NOTE: The getUNDEF is needed to give something for the unused result 0.
30452 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
30453 DAG.getUNDEF(VT), NewChain);
30454 }
30455 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30456 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
30457 assert(!N->hasAnyUseOfValue(0));
30458 // NOTE: The getUNDEF is needed to give something for the unused result 0.
30459 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
30460 DAG.getUNDEF(VT), NewChain);
30461 }
30462
30463 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
30464 // RAUW the chain, but don't worry about the result, as it's unused.
30465 assert(!N->hasAnyUseOfValue(0));
30466 // NOTE: The getUNDEF is needed to give something for the unused result 0.
30467 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
30468 DAG.getUNDEF(VT), LockOp.getValue(1));
30469}
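At the source level, the idempotent case handled above typically comes from an atomic RMW whose operand leaves memory unchanged, such as an `or` with zero. A hypothetical C++ example of that shape follows; whether a given frontend and AtomicExpand actually reach this exact path depends on the target, the ordering, and the sync scope described in the comments above.

#include <atomic>

// Hypothetical example: the RMW below never changes the stored value, so only
// its seq_cst ordering effect has to survive codegen (e.g. a locked stack op).
void OrderingOnlyFence(std::atomic<int> &A) {
  (void)A.fetch_or(0, std::memory_order_seq_cst); // result unused, value unchanged
}

int main() {
  std::atomic<int> A{0};
  OrderingOnlyFence(A);
}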
30470
30471static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
30472 const X86Subtarget &Subtarget) {
30473 auto *Node = cast<AtomicSDNode>(Op.getNode());
30474 SDLoc dl(Node);
30475 EVT VT = Node->getMemoryVT();
30476
30477 bool IsSeqCst =
30478 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
30479 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
30480
30481 // If this store is not sequentially consistent and the type is legal
30482 // we can just keep it.
30483 if (!IsSeqCst && IsTypeLegal)
30484 return Op;
30485
30486 if (VT == MVT::i64 && !IsTypeLegal) {
30487 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
30488 // is enabled.
30489 bool NoImplicitFloatOps =
30490 DAG.getMachineFunction().getFunction().hasFnAttribute(
30491 Attribute::NoImplicitFloat);
30492 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
30493 SDValue Chain;
30494 if (Subtarget.hasSSE1()) {
30495 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
30496 Node->getOperand(2));
30497 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
30498 SclToVec = DAG.getBitcast(StVT, SclToVec);
30499 SDVTList Tys = DAG.getVTList(MVT::Other);
30500 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
30501 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
30502 MVT::i64, Node->getMemOperand());
30503 } else if (Subtarget.hasX87()) {
30504 // First load this into an 80-bit X87 register using a stack temporary.
30505 // This will put the whole integer into the significand.
30506 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
30507 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30508 MachinePointerInfo MPI =
30509 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30510 Chain =
30511 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
30512 MPI, MaybeAlign(), MachineMemOperand::MOStore);
30513 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
30514 SDValue LdOps[] = {Chain, StackPtr};
30515 SDValue Value =
30516 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
30517 /*Align*/ None, MachineMemOperand::MOLoad);
30518 Chain = Value.getValue(1);
30519
30520 // Now use an FIST to do the atomic store.
30521 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
30522 Chain =
30523 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
30524 StoreOps, MVT::i64, Node->getMemOperand());
30525 }
30526
30527 if (Chain) {
30528 // If this is a sequentially consistent store, also emit an appropriate
30529 // barrier.
30530 if (IsSeqCst)
30531 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
30532
30533 return Chain;
30534 }
30535 }
30536 }
30537
30538 // Convert seq_cst store -> xchg
30539 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
30540 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
30541 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
30542 Node->getMemoryVT(),
30543 Node->getOperand(0),
30544 Node->getOperand(1), Node->getOperand(2),
30545 Node->getMemOperand());
30546 return Swap.getValue(1);
30547}
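A source-level case that exercises this path is a 64-bit atomic store compiled for 32-bit x86, where i64 is not type-legal. The example below is hypothetical and only illustrates the shape of the input; whether the SSE, x87, or swap-based form is chosen follows the subtarget checks above.

#include <atomic>
#include <cstdint>

std::atomic<int64_t> GlobalCounter; // hypothetical 64-bit shared variable

void PublishSeqCst(int64_t V) {
  // On a 32-bit x86 target this store cannot stay a plain i64 store; the
  // lowering above turns it into a vector/x87 store or an atomic swap, plus a
  // locked operation for the seq_cst ordering.
  GlobalCounter.store(V, std::memory_order_seq_cst);
}

int main() { PublishSeqCst(42); }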
30548
30549static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
30550 SDNode *N = Op.getNode();
30551 MVT VT = N->getSimpleValueType(0);
30552 unsigned Opc = Op.getOpcode();
30553
30554 // Let legalize expand this if it isn't a legal type yet.
30555 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30556 return SDValue();
30557
30558 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30559 SDLoc DL(N);
30560
30561 // Set the carry flag.
30562 SDValue Carry = Op.getOperand(2);
30563 EVT CarryVT = Carry.getValueType();
30564 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
30565 Carry, DAG.getAllOnesConstant(DL, CarryVT));
30566
30567 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
30568 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
30569 Op.getOperand(0), Op.getOperand(1),
30570 Carry.getValue(1));
30571
30572 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
30573 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
30574 Sum.getValue(1), DL, DAG);
30575 if (N->getValueType(1) == MVT::i1)
30576 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
30577
30578 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
30579}
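The node lowered above implements a wide add/sub with carry chaining. Below is a scalar model of the ADDCARRY semantics only; AddCarry32 is an invented helper, while the real code materializes CF by adding all-ones to the incoming carry and then emits ADC/SBB.

#include <cstdint>
#include <cstdio>

static uint32_t AddCarry32(uint32_t A, uint32_t B, uint32_t CarryIn,
                           uint32_t &CarryOut) {
  uint64_t Wide = (uint64_t)A + (uint64_t)B + (CarryIn ? 1 : 0);
  CarryOut = (uint32_t)(Wide >> 32); // 1 if the 32-bit add overflowed
  return (uint32_t)Wide;
}

int main() {
  uint32_t C;
  uint32_t S = AddCarry32(0xFFFFFFFFu, 1, 0, C);
  std::printf("%u %u\n", S, C); // prints 0 1
}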
30580
30581static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
30582 SelectionDAG &DAG) {
30583 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
30584
30585 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
30586 // which returns the values as { float, float } (in XMM0) or
30587 // { double, double } (which is returned in XMM0, XMM1).
30588 SDLoc dl(Op);
30589 SDValue Arg = Op.getOperand(0);
30590 EVT ArgVT = Arg.getValueType();
30591 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30592
30593 TargetLowering::ArgListTy Args;
30594 TargetLowering::ArgListEntry Entry;
30595
30596 Entry.Node = Arg;
30597 Entry.Ty = ArgTy;
30598 Entry.IsSExt = false;
30599 Entry.IsZExt = false;
30600 Args.push_back(Entry);
30601
30602 bool isF64 = ArgVT == MVT::f64;
30603 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
30604 // the small struct {f32, f32} is returned in (eax, edx). For f64,
30605 // the results are returned via SRet in memory.
30606 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30607 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
30608 const char *LibcallName = TLI.getLibcallName(LC);
30609 SDValue Callee =
30610 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
30611
30612 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
30613 : (Type *)FixedVectorType::get(ArgTy, 4);
30614
30615 TargetLowering::CallLoweringInfo CLI(DAG);
30616 CLI.setDebugLoc(dl)
30617 .setChain(DAG.getEntryNode())
30618 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
30619
30620 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
30621
30622 if (isF64)
30623 // Returned in xmm0 and xmm1.
30624 return CallResult.first;
30625
30626 // Returned in bits 0:31 and 32:63 of xmm0.
30627 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30628 CallResult.first, DAG.getIntPtrConstant(0, dl));
30629 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
30630 CallResult.first, DAG.getIntPtrConstant(1, dl));
30631 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
30632 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
30633}
30634
30635/// Widen a vector input to a vector of NVT. The
30636/// input vector must have the same element type as NVT.
30637static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
30638 bool FillWithZeroes = false) {
30639 // Check if InOp already has the right width.
30640 MVT InVT = InOp.getSimpleValueType();
30641 if (InVT == NVT)
30642 return InOp;
30643
30644 if (InOp.isUndef())
30645 return DAG.getUNDEF(NVT);
30646
30647 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
30648 "input and widen element type must match");
30649
30650 unsigned InNumElts = InVT.getVectorNumElements();
30651 unsigned WidenNumElts = NVT.getVectorNumElements();
30652 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
30653 "Unexpected request for vector widening");
30654
30655 SDLoc dl(InOp);
30656 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
30657 InOp.getNumOperands() == 2) {
30658 SDValue N1 = InOp.getOperand(1);
30659 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
30660 N1.isUndef()) {
30661 InOp = InOp.getOperand(0);
30662 InVT = InOp.getSimpleValueType();
30663 InNumElts = InVT.getVectorNumElements();
30664 }
30665 }
30666 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
30667 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
30668 SmallVector<SDValue, 16> Ops;
30669 for (unsigned i = 0; i < InNumElts; ++i)
30670 Ops.push_back(InOp.getOperand(i));
30671
30672 EVT EltVT = InOp.getOperand(0).getValueType();
30673
30674 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
30675 DAG.getUNDEF(EltVT);
30676 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
30677 Ops.push_back(FillVal);
30678 return DAG.getBuildVector(NVT, dl, Ops);
30679 }
30680 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
30681 DAG.getUNDEF(NVT);
30682 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
30683 InOp, DAG.getIntPtrConstant(0, dl));
30684}
30685
30686static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
30687 SelectionDAG &DAG) {
30688 assert(Subtarget.hasAVX512() &&
30689 "MGATHER/MSCATTER are supported on AVX-512 arch only");
30690
30691 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
30692 SDValue Src = N->getValue();
30693 MVT VT = Src.getSimpleValueType();
30694 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
30695 SDLoc dl(Op);
30696
30697 SDValue Scale = N->getScale();
30698 SDValue Index = N->getIndex();
30699 SDValue Mask = N->getMask();
30700 SDValue Chain = N->getChain();
30701 SDValue BasePtr = N->getBasePtr();
30702
30703 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
30704 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
30705 // If the index is v2i64 and we have VLX we can use xmm for data and index.
30706 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
30707 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30708 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
30709 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
30710 SDVTList VTs = DAG.getVTList(MVT::Other);
30711 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30712 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30713 N->getMemoryVT(), N->getMemOperand());
30714 }
30715 return SDValue();
30716 }
30717
30718 MVT IndexVT = Index.getSimpleValueType();
30719
30720 // If the index is v2i32, we're being called by type legalization and we
30721 // should just let the default handling take care of it.
30722 if (IndexVT == MVT::v2i32)
30723 return SDValue();
30724
30725 // If we don't have VLX and neither the passthru nor the index is 512 bits,
30726 // we need to widen until one is.
30727 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
30728 !Index.getSimpleValueType().is512BitVector()) {
30729 // Determine how much we need to widen by to get a 512-bit type.
30730 unsigned Factor = std::min(512/VT.getSizeInBits(),
30731 512/IndexVT.getSizeInBits());
30732 unsigned NumElts = VT.getVectorNumElements() * Factor;
30733
30734 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30735 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30736 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30737
30738 Src = ExtendToType(Src, VT, DAG);
30739 Index = ExtendToType(Index, IndexVT, DAG);
30740 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30741 }
30742
30743 SDVTList VTs = DAG.getVTList(MVT::Other);
30744 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
30745 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
30746 N->getMemoryVT(), N->getMemOperand());
30747}
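A small worked example of the widening arithmetic used above, under the assumption of a v4i32 value with a v4i32 index and no VLX: both types are 128 bits, so the factor is min(512/128, 512/128) = 4 and the scatter is emitted on v16i32 data, a v16i32 index, and a v16i1 mask. The snippet below only reproduces that arithmetic.

#include <algorithm>
#include <cstdio>

int main() {
  unsigned DataBits = 128, IndexBits = 128, NumElts = 4; // assumed v4i32 / v4i32
  unsigned Factor = std::min(512 / DataBits, 512 / IndexBits);
  std::printf("factor=%u widened elements=%u\n", Factor, NumElts * Factor);
  // prints: factor=4 widened elements=16
}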
30748
30749static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
30750 SelectionDAG &DAG) {
30751
30752 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
30753 MVT VT = Op.getSimpleValueType();
30754 MVT ScalarVT = VT.getScalarType();
30755 SDValue Mask = N->getMask();
30756 MVT MaskVT = Mask.getSimpleValueType();
30757 SDValue PassThru = N->getPassThru();
30758 SDLoc dl(Op);
30759
30760 // Handle AVX masked loads which don't support passthru other than 0.
30761 if (MaskVT.getVectorElementType() != MVT::i1) {
30762 // We also allow undef in the isel pattern.
30763 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
30764 return Op;
30765
30766 SDValue NewLoad = DAG.getMaskedLoad(
30767 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30768 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
30769 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
30770 N->isExpandingLoad());
30771 // Emit a blend.
30772 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
30773 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
30774 }
30775
30776 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
30777 "Expanding masked load is supported on AVX-512 target only!");
30778
30779 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
30780 "Expanding masked load is supported for 32 and 64-bit types only!");
30781
30782 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30783 "Cannot lower masked load op.");
30784
30785 assert((ScalarVT.getSizeInBits() >= 32 ||
30786 (Subtarget.hasBWI() &&
30787 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30788 "Unsupported masked load op.");
30789
30790 // This operation is legal for targets with VLX, but without
30791 // VLX the vector should be widened to 512 bits.
30792 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
30793 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30794 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
30795
30796 // Mask element has to be i1.
30797 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30798 "Unexpected mask type");
30799
30800 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30801
30802 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30803 SDValue NewLoad = DAG.getMaskedLoad(
30804 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
30805 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
30806 N->getExtensionType(), N->isExpandingLoad());
30807
30808 SDValue Extract =
30809 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
30810 DAG.getIntPtrConstant(0, dl));
30811 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
30812 return DAG.getMergeValues(RetOps, dl);
30813}
30814
30815static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
30816 SelectionDAG &DAG) {
30817 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
30818 SDValue DataToStore = N->getValue();
30819 MVT VT = DataToStore.getSimpleValueType();
30820 MVT ScalarVT = VT.getScalarType();
30821 SDValue Mask = N->getMask();
30822 SDLoc dl(Op);
30823
30824 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
30825 "Expanding masked load is supported on AVX-512 target only!");
30826
30827 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
30828 "Expanding masked load is supported for 32 and 64-bit types only!");
30829
30830 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30831 "Cannot lower masked store op.");
30832
30833 assert((ScalarVT.getSizeInBits() >= 32 ||
30834 (Subtarget.hasBWI() &&
30835 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
30836 "Unsupported masked store op.");
30837
30838 // This operation is legal for targets with VLX, but without
30839 // VLX the vector should be widened to 512 bits.
30840 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
30841 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
30842
30843 // Mask element has to be i1.
30844 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
30845 "Unexpected mask type");
30846
30847 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
30848
30849 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
30850 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
30851 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
30852 N->getOffset(), Mask, N->getMemoryVT(),
30853 N->getMemOperand(), N->getAddressingMode(),
30854 N->isTruncatingStore(), N->isCompressingStore());
30855}
30856
30857static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
30858 SelectionDAG &DAG) {
30859 assert(Subtarget.hasAVX2() &&
30860 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
30861
30862 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
30863 SDLoc dl(Op);
30864 MVT VT = Op.getSimpleValueType();
30865 SDValue Index = N->getIndex();
30866 SDValue Mask = N->getMask();
30867 SDValue PassThru = N->getPassThru();
30868 MVT IndexVT = Index.getSimpleValueType();
30869
30870 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
30871
30872 // If the index is v2i32, we're being called by type legalization.
30873 if (IndexVT == MVT::v2i32)
30874 return SDValue();
30875
30876 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
30877 // need to widen until one is.
30878 MVT OrigVT = VT;
30879 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
30880 !IndexVT.is512BitVector()) {
30881 // Determine how much we need to widen by to get a 512-bit type.
30882 unsigned Factor = std::min(512/VT.getSizeInBits(),
30883 512/IndexVT.getSizeInBits());
30884
30885 unsigned NumElts = VT.getVectorNumElements() * Factor;
30886
30887 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
30888 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
30889 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30890
30891 PassThru = ExtendToType(PassThru, VT, DAG);
30892 Index = ExtendToType(Index, IndexVT, DAG);
30893 Mask = ExtendToType(Mask, MaskVT, DAG, true);
30894 }
30895
30896 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
30897 N->getScale() };
30898 SDValue NewGather = DAG.getMemIntrinsicNode(
30899 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
30900 N->getMemOperand());
30901 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
30902 NewGather, DAG.getIntPtrConstant(0, dl));
30903 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
30904}
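// Illustrative sketch (hypothetical helper, not an LLVM API): how much to
// widen a gather when neither the data vector nor the index vector is already
// 512 bits. Both are widened by the same factor, capped so that the larger of
// the two reaches exactly 512 bits.
#include <algorithm>
#include <cassert>

static unsigned gatherWidenFactor(unsigned dataBits, unsigned indexBits) {
  assert(dataBits && indexBits && "vectors must be non-empty");
  return std::min(512u / dataBits, 512u / indexBits);
}
// e.g. a v4i32 gather (128-bit data) with a v4i64 index (256 bits) widens by a
// factor of 2, giving a v8i32 result and a 512-bit v8i64 index.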
30905
30906static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
30907 SDLoc dl(Op);
30908 SDValue Src = Op.getOperand(0);
30909 MVT DstVT = Op.getSimpleValueType();
30910
30911 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
30912 unsigned SrcAS = N->getSrcAddressSpace();
30913
30914 assert(SrcAS != N->getDestAddressSpace() &&
30915 "addrspacecast must be between different address spaces");
30916
30917 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
30918 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
30919 } else if (DstVT == MVT::i64) {
30920 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
30921 } else if (DstVT == MVT::i32) {
30922 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
30923 } else {
30924 report_fatal_error("Bad address space in addrspacecast");
30925 }
30926 return Op;
30927}
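// Illustrative sketch: the integer operation each addrspacecast direction maps
// to in the lowering above. An unsigned 32-bit pointer (PTR32_UPTR) widening to
// 64 bits zero-extends, a signed 32-bit pointer sign-extends, and a cast down
// to 32 bits truncates. The enum below is a stand-in, not the real X86AS set.
#include <cstdint>

enum class Ptr32Kind { Unsigned, Signed };

static uint64_t widenPtr32(uint32_t P, Ptr32Kind K) {
  return K == Ptr32Kind::Unsigned
             ? static_cast<uint64_t>(P)                        // ZERO_EXTEND
             : static_cast<uint64_t>(static_cast<int32_t>(P)); // SIGN_EXTEND
}

static uint32_t narrowPtr64(uint64_t P) { return static_cast<uint32_t>(P); } // TRUNCATE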
30928
30929SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
30930 SelectionDAG &DAG) const {
30931 // TODO: Eventually, the lowering of these nodes should be informed by or
30932 // deferred to the GC strategy for the function in which they appear. For
30933 // now, however, they must be lowered to something. Since they are logically
30934 // no-ops in the case of a null GC strategy (or a GC strategy which does not
30935 // require special handling for these nodes), lower them as literal NOOPs for
30936 // the time being.
30937 SmallVector<SDValue, 2> Ops;
30938
30939 Ops.push_back(Op.getOperand(0));
30940 if (Op->getGluedNode())
30941 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
30942
30943 SDLoc OpDL(Op);
30944 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
30945 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
30946
30947 return NOOP;
30948}
30949
30950// Custom split CVTPS2PH with wide types.
30951static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
30952 SDLoc dl(Op);
30953 EVT VT = Op.getValueType();
30954 SDValue Lo, Hi;
30955 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
30956 EVT LoVT, HiVT;
30957 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
30958 SDValue RC = Op.getOperand(1);
30959 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
30960 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
30961 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30962}
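// Illustrative sketch: the generic "split, convert each half, concatenate"
// shape LowerCVTPS2PH uses above, written over plain std::vector so it runs
// outside the SelectionDAG. convertHalf is an assumed callable standing in for
// the per-half X86ISD::CVTPS2PH node and must return std::vector<Out>.
#include <vector>

template <typename Out, typename In, typename Fn>
static std::vector<Out> splitAndConcat(const std::vector<In> &V, Fn convertHalf) {
  size_t Half = V.size() / 2;
  std::vector<In> Lo(V.begin(), V.begin() + Half), Hi(V.begin() + Half, V.end());
  std::vector<Out> R = convertHalf(Lo), HiR = convertHalf(Hi);
  R.insert(R.end(), HiR.begin(), HiR.end()); // CONCAT_VECTORS of the two halves
  return R;
}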
30963
30964/// Provide custom lowering hooks for some operations.
30965SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
30966 switch (Op.getOpcode()) {
30967 default: llvm_unreachable("Should not custom lower this!");
30968 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
30969 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
30970 return LowerCMP_SWAP(Op, Subtarget, DAG);
30971 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
30972 case ISD::ATOMIC_LOAD_ADD:
30973 case ISD::ATOMIC_LOAD_SUB:
30974 case ISD::ATOMIC_LOAD_OR:
30975 case ISD::ATOMIC_LOAD_XOR:
30976 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
30977 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
30978 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
30979 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
30980 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
30981 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
30982 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
30983 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
30984 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
30985 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
30986 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
30987 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
30988 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
30989 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
30990 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
30991 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
30992 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
30993 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
30994 case ISD::SHL_PARTS:
30995 case ISD::SRA_PARTS:
30996 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
30997 case ISD::FSHL:
30998 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
30999 case ISD::STRICT_SINT_TO_FP:
31000 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
31001 case ISD::STRICT_UINT_TO_FP:
31002 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
31003 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
31004 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
31005 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
31006 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
31007 case ISD::ZERO_EXTEND_VECTOR_INREG:
31008 case ISD::SIGN_EXTEND_VECTOR_INREG:
31009 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
31010 case ISD::FP_TO_SINT:
31011 case ISD::STRICT_FP_TO_SINT:
31012 case ISD::FP_TO_UINT:
31013 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
31014 case ISD::FP_TO_SINT_SAT:
31015 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
31016 case ISD::FP_EXTEND:
31017 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
31018 case ISD::FP_ROUND:
31019 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
31020 case ISD::FP16_TO_FP:
31021 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
31022 case ISD::FP_TO_FP16:
31023 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
31024 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
31025 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
31026 case ISD::FADD:
31027 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
31028 case ISD::FROUND: return LowerFROUND(Op, DAG);
31029 case ISD::FABS:
31030 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
31031 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
31032 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
31033 case ISD::ISNAN: return lowerISNAN(Op, DAG);
31034 case ISD::LRINT:
31035 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
31036 case ISD::SETCC:
31037 case ISD::STRICT_FSETCC:
31038 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
31039 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
31040 case ISD::SELECT: return LowerSELECT(Op, DAG);
31041 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
31042 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
31043 case ISD::VASTART: return LowerVASTART(Op, DAG);
31044 case ISD::VAARG: return LowerVAARG(Op, DAG);
31045 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
31046 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
31047 case ISD::INTRINSIC_VOID:
31048 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
31049 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
31050 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
31051 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
31052 case ISD::FRAME_TO_ARGS_OFFSET:
31053 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
31054 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
31055 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
31056 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
31057 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
31058 case ISD::EH_SJLJ_SETUP_DISPATCH:
31059 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
31060 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
31061 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
31062 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
31063 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
31064 case ISD::CTLZ:
31065 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
31066 case ISD::CTTZ:
31067 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
31068 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
31069 case ISD::MULHS:
31070 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
31071 case ISD::ROTL:
31072 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
31073 case ISD::SRA:
31074 case ISD::SRL:
31075 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
31076 case ISD::SADDO:
31077 case ISD::UADDO:
31078 case ISD::SSUBO:
31079 case ISD::USUBO: return LowerXALUO(Op, DAG);
31080 case ISD::SMULO:
31081 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
31082 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
31083 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
31084 case ISD::SADDO_CARRY:
31085 case ISD::SSUBO_CARRY:
31086 case ISD::ADDCARRY:
31087 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
31088 case ISD::ADD:
31089 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
31090 case ISD::UADDSAT:
31091 case ISD::SADDSAT:
31092 case ISD::USUBSAT:
31093 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
31094 case ISD::SMAX:
31095 case ISD::SMIN:
31096 case ISD::UMAX:
31097 case ISD::UMIN: return LowerMINMAX(Op, DAG);
31098 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
31099 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
31100 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
31101 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
31102 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
31103 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
31104 case ISD::GC_TRANSITION_START:
31105 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
31106 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
31107 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
31108 }
31109}
31110
31111/// Replace a node with an illegal result type with a new node built out of
31112/// custom code.
31113void X86TargetLowering::ReplaceNodeResults(SDNode *N,
31114 SmallVectorImpl<SDValue>&Results,
31115 SelectionDAG &DAG) const {
31116 SDLoc dl(N);
31117 switch (N->getOpcode()) {
31118 default:
31119#ifndef NDEBUG
31120 dbgs() << "ReplaceNodeResults: ";
31121 N->dump(&DAG);
31122#endif
31123 llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 31123)
;
31124 case X86ISD::CVTPH2PS: {
31125 EVT VT = N->getValueType(0);
31126 SDValue Lo, Hi;
31127 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31128 EVT LoVT, HiVT;
31129 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31130 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
31131 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
31132 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31133 Results.push_back(Res);
31134 return;
31135 }
31136 case X86ISD::STRICT_CVTPH2PS: {
31137 EVT VT = N->getValueType(0);
31138 SDValue Lo, Hi;
31139 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
31140 EVT LoVT, HiVT;
31141 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
31142 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
31143 {N->getOperand(0), Lo});
31144 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
31145 {N->getOperand(0), Hi});
31146 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31147 Lo.getValue(1), Hi.getValue(1));
31148 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31149 Results.push_back(Res);
31150 Results.push_back(Chain);
31151 return;
31152 }
31153 case X86ISD::CVTPS2PH:
31154 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
31155 return;
31156 case ISD::CTPOP: {
31157 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31158 // Use a v2i64 if possible.
31159 bool NoImplicitFloatOps =
31160 DAG.getMachineFunction().getFunction().hasFnAttribute(
31161 Attribute::NoImplicitFloat);
31162 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
31163 SDValue Wide =
31164 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
31165 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
31166 // Bit count should fit in 32-bits, extract it as that and then zero
31167 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
31168 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
31169 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
31170 DAG.getIntPtrConstant(0, dl));
31171 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
31172 Results.push_back(Wide);
31173 }
31174 return;
31175 }
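// Illustrative sketch: why the i64 CTPOP above only needs the low 32 bits of
// the vector result. The number of set bits in a 64-bit value is at most 64,
// so bits 63:32 of the count are always zero and a single 32-bit extract plus
// a zero-extend is enough. popcount64ViaLow32 is a standalone demonstration.
#include <cstdint>

static uint64_t popcount64ViaLow32(uint64_t X) {
  uint32_t Low = 0;
  for (; X; X &= X - 1) // clear the lowest set bit; at most 64 iterations
    ++Low;
  return static_cast<uint64_t>(Low); // ZERO_EXTEND; bits 63:32 are always zero
}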
31176 case ISD::MUL: {
31177 EVT VT = N->getValueType(0);
31178 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31179 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
31180 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
31181 // elements are needed.
31182 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
31183 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
31184 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
31185 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
31186 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31187 unsigned NumConcats = 16 / VT.getVectorNumElements();
31188 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31189 ConcatOps[0] = Res;
31190 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
31191 Results.push_back(Res);
31192 return;
31193 }
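// Illustrative sketch: pre-promoting an i8 multiply to i16 and truncating back
// is lossless for the low 8 bits, which is why the widening above is safe.
// Scalar demonstration of the per-lane identity.
#include <cstdint>

static uint8_t mulViaI16(uint8_t A, uint8_t B) {
  uint16_t Wide = static_cast<uint16_t>(A) * static_cast<uint16_t>(B); // ANY_EXTEND + MUL
  return static_cast<uint8_t>(Wide);                                   // TRUNCATE
}
// For any A, B: mulViaI16(A, B) == static_cast<uint8_t>(A * B).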
31194 case X86ISD::VPMADDWD:
31195 case X86ISD::AVG: {
31196 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
31197 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31198
31199 EVT VT = N->getValueType(0);
31200 EVT InVT = N->getOperand(0).getValueType();
31201 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
31202 "Expected a VT that divides into 128 bits.");
31203 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31204 "Unexpected type action!");
31205 unsigned NumConcat = 128 / InVT.getSizeInBits();
31206
31207 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
31208 InVT.getVectorElementType(),
31209 NumConcat * InVT.getVectorNumElements());
31210 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
31211 VT.getVectorElementType(),
31212 NumConcat * VT.getVectorNumElements());
31213
31214 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
31215 Ops[0] = N->getOperand(0);
31216 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
31217 Ops[0] = N->getOperand(1);
31218 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
31219
31220 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
31221 Results.push_back(Res);
31222 return;
31223 }
31224 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
31225 case X86ISD::FMINC:
31226 case X86ISD::FMIN:
31227 case X86ISD::FMAXC:
31228 case X86ISD::FMAX: {
31229 EVT VT = N->getValueType(0);
31230 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
31231 SDValue UNDEF = DAG.getUNDEF(VT);
31232 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
31233 N->getOperand(0), UNDEF);
31234 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
31235 N->getOperand(1), UNDEF);
31236 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
31237 return;
31238 }
31239 case ISD::SDIV:
31240 case ISD::UDIV:
31241 case ISD::SREM:
31242 case ISD::UREM: {
31243 EVT VT = N->getValueType(0);
31244 if (VT.isVector()) {
31245 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31246 "Unexpected type action!");
31247 // If this RHS is a constant splat vector we can widen this and let
31248 // division/remainder by constant optimize it.
31249 // TODO: Can we do something for non-splat?
31250 APInt SplatVal;
31251 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
31252 unsigned NumConcats = 128 / VT.getSizeInBits();
31253 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
31254 Ops0[0] = N->getOperand(0);
31255 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
31256 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
31257 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
31258 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
31259 Results.push_back(Res);
31260 }
31261 return;
31262 }
31263
31264 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
31265 Results.push_back(V);
31266 return;
31267 }
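// Illustrative sketch: the widening done above for a division whose RHS is a
// constant splat. The defined lanes sit in the low half of a wider vector, the
// padding lanes hold arbitrary (undef) data, and the divisor is rebuilt as a
// full-width splat, so every lane divides by the same known constant and the
// usual divide-by-constant strength reduction applies. SplatDivisor is assumed
// nonzero (division by zero is undefined in the IR anyway).
#include <array>
#include <cstdint>

static std::array<uint32_t, 4> widenedSplatDiv(std::array<uint32_t, 2> Lo,
                                               uint32_t SplatDivisor) {
  std::array<uint32_t, 4> Wide = {Lo[0], Lo[1], /*undef*/ 0, /*undef*/ 0};
  for (uint32_t &Lane : Wide)
    Lane /= SplatDivisor; // constant divisor -> becomes multiply/shift sequence
  return Wide;            // only the low two lanes are meaningful
}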
31268 case ISD::TRUNCATE: {
31269 MVT VT = N->getSimpleValueType(0);
31270 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
31271 return;
31272
31273 // The generic legalizer will try to widen the input type to the same
31274 // number of elements as the widened result type. But this isn't always
31275 // the best thing so do some custom legalization to avoid some cases.
31276 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
31277 SDValue In = N->getOperand(0);
31278 EVT InVT = In.getValueType();
31279
31280 unsigned InBits = InVT.getSizeInBits();
31281 if (128 % InBits == 0) {
31282 // 128-bit and smaller inputs should avoid a truncate altogether and
31283 // just use a build_vector that will become a shuffle.
31284 // TODO: Widen and use a shuffle directly?
31285 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
31286 EVT EltVT = VT.getVectorElementType();
31287 unsigned WidenNumElts = WidenVT.getVectorNumElements();
31288 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
31289 // Use the original element count so we don't do more scalar opts than
31290 // necessary.
31291 unsigned MinElts = VT.getVectorNumElements();
31292 for (unsigned i=0; i < MinElts; ++i) {
31293 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
31294 DAG.getIntPtrConstant(i, dl));
31295 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
31296 }
31297 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
31298 return;
31299 }
31300 // With AVX512 there are some cases that can use a target specific
31301 // truncate node to go from 256/512 to less than 128 with zeros in the
31302 // upper elements of the 128 bit result.
31303 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
31304 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
31305 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
31306 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
31307 return;
31308 }
31309 // There's one case we can widen to 512 bits and use VTRUNC.
31310 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
31311 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
31312 DAG.getUNDEF(MVT::v4i64));
31313 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
31314 return;
31315 }
31316 }
31317 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
31318 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
31319 isTypeLegal(MVT::v4i64)) {
31320 // Input needs to be split and the output needs to be widened. Let's use two
31321 // VTRUNCs, and shuffle their results together into the wider type.
31322 SDValue Lo, Hi;
31323 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
31324
31325 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
31326 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
31327 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
31328 { 0, 1, 2, 3, 16, 17, 18, 19,
31329 -1, -1, -1, -1, -1, -1, -1, -1 });
31330 Results.push_back(Res);
31331 return;
31332 }
31333
31334 return;
31335 }
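// Illustrative sketch: the "extract each element, truncate it, rebuild the
// vector" strategy used above for inputs of 128 bits or less, written over a
// fixed-size array. The widened result simply leaves the trailing lanes as
// don't-care values (zero here).
#include <array>
#include <cstddef>
#include <cstdint>

static std::array<uint8_t, 16>
truncateV4i32ToV4i8Widened(const std::array<uint32_t, 4> &In) {
  std::array<uint8_t, 16> Out = {}; // widened result; lanes 4..15 are don't-care
  for (size_t I = 0; I < In.size(); ++I)
    Out[I] = static_cast<uint8_t>(In[I]); // EXTRACT_VECTOR_ELT + TRUNCATE per lane
  return Out;                             // becomes a BUILD_VECTOR / shuffle
}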
31336 case ISD::ANY_EXTEND:
31337 // Right now, only MVT::v8i8 has Custom action for an illegal type.
31338 // It's intended to custom handle the input type.
31339 assert(N->getValueType(0) == MVT::v8i8 &&
31340 "Do not know how to legalize this Node");
31341 return;
31342 case ISD::SIGN_EXTEND:
31343 case ISD::ZERO_EXTEND: {
31344 EVT VT = N->getValueType(0);
31345 SDValue In = N->getOperand(0);
31346 EVT InVT = In.getValueType();
31347 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
31348 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
31349 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
31350 "Unexpected type action!");
31351 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
31352 // Custom split this so we can extend i8/i16->i32 invec. This is better
31353 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
31354 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
31355 // we allow the sra from the extend to i32 to be shared by the split.
31356 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
31357
31358 // Fill a vector with sign bits for each element.
31359 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
31360 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
31361
31362 // Create an unpackl and unpackh to interleave the sign bits then bitcast
31363 // to v2i64.
31364 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
31365 {0, 4, 1, 5});
31366 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
31367 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
31368 {2, 6, 3, 7});
31369 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
31370
31371 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31372 Results.push_back(Res);
31373 return;
31374 }
31375
31376 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
31377 if (!InVT.is128BitVector()) {
31378 // Not a 128 bit vector, but maybe type legalization will promote
31379 // it to 128 bits.
31380 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
31381 return;
31382 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
31383 if (!InVT.is128BitVector())
31384 return;
31385
31386 // Promote the input to 128 bits. Type legalization will turn this into
31387 // zext_inreg/sext_inreg.
31388 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
31389 }
31390
31391 // Perform custom splitting instead of the two stage extend we would get
31392 // by default.
31393 EVT LoVT, HiVT;
31394 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
31395 assert(isTypeLegal(LoVT) && "Split VT not legal?");
31396
31397 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
31398
31399 // We need to shift the input over by half the number of elements.
31400 unsigned NumElts = InVT.getVectorNumElements();
31401 unsigned HalfNumElts = NumElts / 2;
31402 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
31403 for (unsigned i = 0; i != HalfNumElts; ++i)
31404 ShufMask[i] = i + HalfNumElts;
31405
31406 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
31407 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
31408
31409 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
31410 Results.push_back(Res);
31411 }
31412 return;
31413 }
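// Illustrative sketch: the sign-bit interleave used above to extend i32 to i64
// without a 64-bit compare. Each element is paired with a word holding copies
// of its sign bit (the SETGT-against-zero lane); interleaving the two words
// yields the sign-extended 64-bit value. Scalar demonstration per lane.
#include <cstdint>

static int64_t signExtendViaInterleave(int32_t X) {
  uint32_t Low = static_cast<uint32_t>(X);
  uint32_t SignWord = (0 > X) ? 0xFFFFFFFFu : 0u; // SETGT(0, X) lane, all sign bits
  return static_cast<int64_t>((static_cast<uint64_t>(SignWord) << 32) | Low); // unpack
}
// signExtendViaInterleave(-5) == -5 and signExtendViaInterleave(7) == 7.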
31414 case ISD::FP_TO_SINT:
31415 case ISD::STRICT_FP_TO_SINT:
31416 case ISD::FP_TO_UINT:
31417 case ISD::STRICT_FP_TO_UINT: {
31418 bool IsStrict = N->isStrictFPOpcode();
31419 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
31420 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
31421 EVT VT = N->getValueType(0);
31422 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31423 EVT SrcVT = Src.getValueType();
31424
31425 if (VT.isVector() && Subtarget.hasFP16() &&
31426 SrcVT.getVectorElementType() == MVT::f16) {
31427 EVT EleVT = VT.getVectorElementType();
31428 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
31429
31430 if (SrcVT != MVT::v8f16) {
31431 SDValue Tmp =
31432 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
31433 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
31434 Ops[0] = Src;
31435 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
31436 }
31437
31438 SDValue Res, Chain;
31439 if (IsStrict) {
31440 unsigned Opc =
31441 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31442 Res =
31443 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
31444 Chain = Res.getValue(1);
31445 } else {
31446 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31447 Res = DAG.getNode(Opc, dl, ResVT, Src);
31448 }
31449
31450 // TODO: Need to add exception check code for strict FP.
31451 if (EleVT.getSizeInBits() < 16) {
31452 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
31453 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
31454
31455 // Now widen to 128 bits.
31456 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
31457 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
31458 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
31459 ConcatOps[0] = Res;
31460 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
31461 }
31462
31463 Results.push_back(Res);
31464 if (IsStrict)
31465 Results.push_back(Chain);
31466
31467 return;
31468 }
31469
31470 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
31471 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31472 "Unexpected type action!");
31473
31474 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
31475 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
31476 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
31477 VT.getVectorNumElements());
31478 SDValue Res;
31479 SDValue Chain;
31480 if (IsStrict) {
31481 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
31482 {N->getOperand(0), Src});
31483 Chain = Res.getValue(1);
31484 } else
31485 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
31486
31487 // Preserve what we know about the size of the original result. If the
31488 // result is v2i32, we have to manually widen the assert.
31489 if (PromoteVT == MVT::v2i32)
31490 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
31491 DAG.getUNDEF(MVT::v2i32));
31492
31493 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
31494 Res.getValueType(), Res,
31495 DAG.getValueType(VT.getVectorElementType()));
31496
31497 if (PromoteVT == MVT::v2i32)
31498 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
31499 DAG.getIntPtrConstant(0, dl));
31500
31501 // Truncate back to the original width.
31502 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
31503
31504 // Now widen to 128 bits.
31505 unsigned NumConcats = 128 / VT.getSizeInBits();
31506 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
31507 VT.getVectorNumElements() * NumConcats);
31508 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
31509 ConcatOps[0] = Res;
31510 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
31511 Results.push_back(Res);
31512 if (IsStrict)
31513 Results.push_back(Chain);
31514 return;
31515 }
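// Illustrative sketch: the promote-then-truncate shape used above when the
// integer result is narrower than 32 bits. The conversion targets a 32-bit
// lane (the narrowest the hardware produces), and the truncate back only drops
// bits known to be sign/zero copies, which is what AssertSext/AssertZext
// records. Valid when the value fits the narrow type, which fptosi guarantees
// (out-of-range results are poison anyway).
#include <cstdint>

static int8_t fpToI8ViaI32(float F) {
  int32_t Wide = static_cast<int32_t>(F); // FP_TO_SINT into a 32-bit lane
  return static_cast<int8_t>(Wide);       // TRUNCATE back to the requested width
}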
31516
31517
31518 if (VT == MVT::v2i32) {
31519 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
31520 "Strict unsigned conversion requires AVX512");
31521 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31522 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
31523 "Unexpected type action!");
31524 if (Src.getValueType() == MVT::v2f64) {
31525 if (!IsSigned && !Subtarget.hasAVX512()) {
31526 SDValue Res =
31527 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
31528 Results.push_back(Res);
31529 return;
31530 }
31531
31532 unsigned Opc;
31533 if (IsStrict)
31534 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31535 else
31536 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31537
31538 // If we have VLX we can emit a target specific FP_TO_UINT node.
31539 if (!IsSigned && !Subtarget.hasVLX()) {
31540 // Otherwise we can defer to the generic legalizer which will widen
31541 // the input as well. This will be further widened during op
31542 // legalization to v8i32<-v8f64.
31543 // For strict nodes we'll need to widen ourselves.
31544 // FIXME: Fix the type legalizer to safely widen strict nodes?
31545 if (!IsStrict)
31546 return;
31547 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
31548 DAG.getConstantFP(0.0, dl, MVT::v2f64));
31549 Opc = N->getOpcode();
31550 }
31551 SDValue Res;
31552 SDValue Chain;
31553 if (IsStrict) {
31554 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
31555 {N->getOperand(0), Src});
31556 Chain = Res.getValue(1);
31557 } else {
31558 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
31559 }
31560 Results.push_back(Res);
31561 if (IsStrict)
31562 Results.push_back(Chain);
31563 return;
31564 }
31565
31566 // Custom widen strict v2f32->v2i32 by padding with zeros.
31567 // FIXME: Should generic type legalizer do this?
31568 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
31569 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
31570 DAG.getConstantFP(0.0, dl, MVT::v2f32));
31571 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
31572 {N->getOperand(0), Src});
31573 Results.push_back(Res);
31574 Results.push_back(Res.getValue(1));
31575 return;
31576 }
31577
31578 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
31579 // so early out here.
31580 return;
31581 }
31582
31583 assert(!VT.isVector() && "Vectors should have been handled above!");
31584
31585 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
31586 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
31587 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
31588 assert(!Subtarget.is64Bit() && "i64 should be legal");
31589 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
31590 // If we use a 128-bit result we might need to use a target specific node.
31591 unsigned SrcElts =
31592 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
31593 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
31594 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
31595 unsigned Opc = N->getOpcode();
31596 if (NumElts != SrcElts) {
31597 if (IsStrict)
31598 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
31599 else
31600 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
31601 }
31602
31603 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
31604 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
31605 DAG.getConstantFP(0.0, dl, VecInVT), Src,
31606 ZeroIdx);
31607 SDValue Chain;
31608 if (IsStrict) {
31609 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
31610 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
31611 Chain = Res.getValue(1);
31612 } else
31613 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
31614 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
31615 Results.push_back(Res);
31616 if (IsStrict)
31617 Results.push_back(Chain);
31618 return;
31619 }
31620
31621 SDValue Chain;
31622 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
31623 Results.push_back(V);
31624 if (IsStrict)
31625 Results.push_back(Chain);
31626 }
31627 return;
31628 }
31629 case ISD::LRINT:
31630 case ISD::LLRINT: {
31631 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
31632 Results.push_back(V);
31633 return;
31634 }
31635
31636 case ISD::SINT_TO_FP:
31637 case ISD::STRICT_SINT_TO_FP:
31638 case ISD::UINT_TO_FP:
31639 case ISD::STRICT_UINT_TO_FP: {
31640 bool IsStrict = N->isStrictFPOpcode();
31641 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
31642 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
31643 EVT VT = N->getValueType(0);
31644 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31645 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
31646 Subtarget.hasVLX()) {
31647 if (Src.getValueType().getVectorElementType() == MVT::i16)
31648 return;
31649
31650 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
31651 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31652 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
31653 : DAG.getUNDEF(MVT::v2i32));
31654 if (IsStrict) {
31655 unsigned Opc =
31656 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
31657 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
31658 {N->getOperand(0), Src});
31659 Results.push_back(Res);
31660 Results.push_back(Res.getValue(1));
31661 } else {
31662 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31663 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
31664 }
31665 return;
31666 }
31667 if (VT != MVT::v2f32)
31668 return;
31669 EVT SrcVT = Src.getValueType();
31670 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
31671 if (IsStrict) {
31672 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
31673 : X86ISD::STRICT_CVTUI2P;
31674 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
31675 {N->getOperand(0), Src});
31676 Results.push_back(Res);
31677 Results.push_back(Res.getValue(1));
31678 } else {
31679 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
31680 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
31681 }
31682 return;
31683 }
31684 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
31685 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
31686 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
31687 SDValue One = DAG.getConstant(1, dl, SrcVT);
31688 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
31689 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
31690 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
31691 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
31692 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
31693 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
31694 for (int i = 0; i != 2; ++i) {
31695 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
31696 SignSrc, DAG.getIntPtrConstant(i, dl));
31697 if (IsStrict)
31698 SignCvts[i] =
31699 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
31700 {N->getOperand(0), Elt});
31701 else
31702 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
31703 };
31704 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
31705 SDValue Slow, Chain;
31706 if (IsStrict) {
31707 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31708 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
31709 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
31710 {Chain, SignCvt, SignCvt});
31711 Chain = Slow.getValue(1);
31712 } else {
31713 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
31714 }
31715 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
31716 IsNeg =
31717 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
31718 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
31719 Results.push_back(Cvt);
31720 if (IsStrict)
31721 Results.push_back(Chain);
31722 return;
31723 }
31724
31725 if (SrcVT != MVT::v2i32)
31726 return;
31727
31728 if (IsSigned || Subtarget.hasAVX512()) {
31729 if (!IsStrict)
31730 return;
31731
31732 // Custom widen strict v2i32->v2f32 to avoid scalarization.
31733 // FIXME: Should generic type legalizer do this?
31734 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
31735 DAG.getConstant(0, dl, MVT::v2i32));
31736 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
31737 {N->getOperand(0), Src});
31738 Results.push_back(Res);
31739 Results.push_back(Res.getValue(1));
31740 return;
31741 }
31742
31743 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31744 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
31745 SDValue VBias =
31746 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
31747 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
31748 DAG.getBitcast(MVT::v2i64, VBias));
31749 Or = DAG.getBitcast(MVT::v2f64, Or);
31750 if (IsStrict) {
31751 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
31752 {N->getOperand(0), Or, VBias});
31753 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
31754 {MVT::v4f32, MVT::Other},
31755 {Sub.getValue(1), Sub});
31756 Results.push_back(Res);
31757 Results.push_back(Res.getValue(1));
31758 } else {
31759 // TODO: Are there any fast-math-flags to propagate here?
31760 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
31761 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
31762 }
31763 return;
31764 }
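// Illustrative sketch of the bias trick used above for unsigned i32 -> FP
// without AVX-512: OR the zero-extended value into the mantissa of 2^52 (bit
// pattern 0x4330000000000000), reinterpret as double, and subtract 2^52. The
// result is exactly (double)X, which is then rounded to f32 (VFPROUND).
#include <cstdint>
#include <cstring>

static float uint32ToFloatViaBias(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 with X in the low mantissa bits
  double D;
  std::memcpy(&D, &Bits, sizeof(D));         // bitcast i64 -> f64
  D -= 4503599627370496.0;                   // subtract 2^52; leaves exactly (double)X
  return static_cast<float>(D);              // final round to f32
}
// uint32ToFloatViaBias(V) matches static_cast<float>(static_cast<double>(V))
// for every uint32_t V.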
31765 case ISD::STRICT_FP_ROUND:
31766 case ISD::FP_ROUND: {
31767 bool IsStrict = N->isStrictFPOpcode();
31768 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31769 EVT VT = N->getValueType(0);
31770 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
31771 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
31772 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
31773 : DAG.getUNDEF(MVT::v2f32);
31774 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
31775 }
31776 if (!isTypeLegal(Src.getValueType()))
31777 return;
31778 SDValue V;
31779 if (IsStrict)
31780 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
31781 {N->getOperand(0), Src});
31782 else
31783 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
31784 Results.push_back(V);
31785 if (IsStrict)
31786 Results.push_back(V.getValue(1));
31787 return;
31788 }
31789 case ISD::FP_EXTEND:
31790 case ISD::STRICT_FP_EXTEND: {
31791 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
31792 // No other ValueType for FP_EXTEND should reach this point.
31793 assert(N->getValueType(0) == MVT::v2f32 &&
31794 "Do not know how to legalize this Node");
31795 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
31796 return;
31797 bool IsStrict = N->isStrictFPOpcode();
31798 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
31799 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
31800 : DAG.getUNDEF(MVT::v2f16);
31801 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
31802 if (IsStrict)
31803 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
31804 {N->getOperand(0), V});
31805 else
31806 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
31807 Results.push_back(V);
31808 if (IsStrict)
31809 Results.push_back(V.getValue(1));
31810 return;
31811 }
31812 case ISD::INTRINSIC_W_CHAIN: {
31813 unsigned IntNo = N->getConstantOperandVal(1);
31814 switch (IntNo) {
31815 default : llvm_unreachable("Do not know how to custom type "
31816 "legalize this intrinsic operation!");
31817 case Intrinsic::x86_rdtsc:
31818 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
31819 Results);
31820 case Intrinsic::x86_rdtscp:
31821 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
31822 Results);
31823 case Intrinsic::x86_rdpmc:
31824 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
31825 Results);
31826 return;
31827 case Intrinsic::x86_xgetbv:
31828 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
31829 Results);
31830 return;
31831 }
31832 }
31833 case ISD::READCYCLECOUNTER: {
31834 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
31835 }
31836 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
31837 EVT T = N->getValueType(0);
31838 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
31839 bool Regs64bit = T == MVT::i128;
31840 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
31841 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
31842 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
31843 SDValue cpInL, cpInH;
31844 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31845 DAG.getConstant(0, dl, HalfT));
31846 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
31847 DAG.getConstant(1, dl, HalfT));
31848 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
31849 Regs64bit ? X86::RAX : X86::EAX,
31850 cpInL, SDValue());
31851 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
31852 Regs64bit ? X86::RDX : X86::EDX,
31853 cpInH, cpInL.getValue(1));
31854 SDValue swapInL, swapInH;
31855 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31856 DAG.getConstant(0, dl, HalfT));
31857 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
31858 DAG.getConstant(1, dl, HalfT));
31859 swapInH =
31860 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
31861 swapInH, cpInH.getValue(1));
31862
31863 // In 64-bit mode we might need the base pointer in RBX, but we can't know
31864 // until later. So we keep the RBX input in a vreg and use a custom
31865 // inserter.
31866 // Since RBX will be a reserved register the register allocator will not
31867 // make sure its value will be properly saved and restored around this
31868 // live-range.
31869 SDValue Result;
31870 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31871 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
31872 if (Regs64bit) {
31873 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
31874 swapInH.getValue(1)};
31875 Result =
31876 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
31877 } else {
31878 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
31879 swapInH.getValue(1));
31880 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
31881 swapInL.getValue(1)};
31882 Result =
31883 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
31884 }
31885
31886 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
31887 Regs64bit ? X86::RAX : X86::EAX,
31888 HalfT, Result.getValue(1));
31889 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
31890 Regs64bit ? X86::RDX : X86::EDX,
31891 HalfT, cpOutL.getValue(2));
31892 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
31893
31894 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
31895 MVT::i32, cpOutH.getValue(2));
31896 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
31897 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
31898
31899 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
31900 Results.push_back(Success);
31901 Results.push_back(EFLAGS.getValue(1));
31902 return;
31903 }
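// Illustrative note (not part of the original source): the register choreography
// above follows the CMPXCHG8B/CMPXCHG16B convention: the expected value goes in
// EDX:EAX (RDX:RAX), the replacement in ECX:EBX (RCX:RBX), the old memory value
// comes back in EDX:EAX (RDX:RAX), and ZF reports success, which is why the
// success result is built from a COND_E setcc on EFLAGS.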
31904 case ISD::ATOMIC_LOAD: {
31905 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
31906 bool NoImplicitFloatOps =
31907 DAG.getMachineFunction().getFunction().hasFnAttribute(
31908 Attribute::NoImplicitFloat);
31909 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31910 auto *Node = cast<AtomicSDNode>(N);
31911 if (Subtarget.hasSSE1()) {
31912 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
31913 // Then extract the lower 64-bits.
31914 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31915 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
31916 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31917 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
31918 MVT::i64, Node->getMemOperand());
31919 if (Subtarget.hasSSE2()) {
31920 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
31921 DAG.getIntPtrConstant(0, dl));
31922 Results.push_back(Res);
31923 Results.push_back(Ld.getValue(1));
31924 return;
31925 }
31926 // We use an alternative sequence for SSE1 that extracts as v2f32 and
31927 // then casts to i64. This avoids a 128-bit stack temporary being
31928 // created by type legalization if we were to cast v4f32->v2i64.
31929 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
31930 DAG.getIntPtrConstant(0, dl));
31931 Res = DAG.getBitcast(MVT::i64, Res);
31932 Results.push_back(Res);
31933 Results.push_back(Ld.getValue(1));
31934 return;
31935 }
31936 if (Subtarget.hasX87()) {
31937 // First load this into an 80-bit X87 register. This will put the whole
31938 // integer into the significand.
31939 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31940 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
31941 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
31942 dl, Tys, Ops, MVT::i64,
31943 Node->getMemOperand());
31944 SDValue Chain = Result.getValue(1);
31945
31946 // Now store the X87 register to a stack temporary and convert to i64.
31947 // This store is not atomic and doesn't need to be.
31948 // FIXME: We don't need a stack temporary if the result of the load
31949 // is already being stored. We could just directly store there.
31950 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31951 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31952 MachinePointerInfo MPI =
31953 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31954 SDValue StoreOps[] = { Chain, Result, StackPtr };
31955 Chain = DAG.getMemIntrinsicNode(
31956 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
31957 MPI, None /*Align*/, MachineMemOperand::MOStore);
31958
31959 // Finally load the value back from the stack temporary and return it.
31960 // This load is not atomic and doesn't need to be.
31961 // This load will be further type legalized.
31962 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
31963 Results.push_back(Result);
31964 Results.push_back(Result.getValue(1));
31965 return;
31966 }
31967 }
31968 // TODO: Use MOVLPS when SSE1 is available?
31969 // Delegate to generic TypeLegalization. Situations we can really handle
31970 // should have already been dealt with by AtomicExpandPass.cpp.
31971 break;
31972 }
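// Illustrative note (not part of the original source): both paths above depend
// on a single aligned 8-byte access performed by one instruction (MOVQ/MOVLPS
// or FILD) being atomic on x86, which is what lets a 32-bit target legalize an
// i64 atomic load without a lock prefix.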
31973 case ISD::ATOMIC_SWAP:
31974 case ISD::ATOMIC_LOAD_ADD:
31975 case ISD::ATOMIC_LOAD_SUB:
31976 case ISD::ATOMIC_LOAD_AND:
31977 case ISD::ATOMIC_LOAD_OR:
31978 case ISD::ATOMIC_LOAD_XOR:
31979 case ISD::ATOMIC_LOAD_NAND:
31980 case ISD::ATOMIC_LOAD_MIN:
31981 case ISD::ATOMIC_LOAD_MAX:
31982 case ISD::ATOMIC_LOAD_UMIN:
31983 case ISD::ATOMIC_LOAD_UMAX:
31984 // Delegate to generic TypeLegalization. Situations we can really handle
31985 // should have already been dealt with by AtomicExpandPass.cpp.
31986 break;
31987
31988 case ISD::BITCAST: {
31989 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31990 EVT DstVT = N->getValueType(0);
31991 EVT SrcVT = N->getOperand(0).getValueType();
31992
31993 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
31994 // we can split using the k-register rather than memory.
31995 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
31996 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31997 SDValue Lo, Hi;
31998 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
31999 Lo = DAG.getBitcast(MVT::i32, Lo);
32000 Hi = DAG.getBitcast(MVT::i32, Hi);
32001 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
32002 Results.push_back(Res);
32003 return;
32004 }
32005
32006 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
32007 // FIXME: Use v4f32 for SSE1?
32008 assert(Subtarget.hasSSE2() && "Requires SSE2");
32009 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
32010 "Unexpected type action!");
32011 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
32012 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
32013 N->getOperand(0));
32014 Res = DAG.getBitcast(WideVT, Res);
32015 Results.push_back(Res);
32016 return;
32017 }
32018
32019 return;
32020 }
32021 case ISD::MGATHER: {
32022 EVT VT = N->getValueType(0);
32023 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
32024 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
32025 auto *Gather = cast<MaskedGatherSDNode>(N);
32026 SDValue Index = Gather->getIndex();
32027 if (Index.getValueType() != MVT::v2i64)
32028 return;
32029 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32030 "Unexpected type action!");
32031 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32032 SDValue Mask = Gather->getMask();
32033 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32034 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
32035 Gather->getPassThru(),
32036 DAG.getUNDEF(VT));
32037 if (!Subtarget.hasVLX()) {
32038 // We need to widen the mask, but the instruction will only use 2
32039 // of its elements. So we can use undef.
32040 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
32041 DAG.getUNDEF(MVT::v2i1));
32042 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
32043 }
32044 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
32045 Gather->getBasePtr(), Index, Gather->getScale() };
32046 SDValue Res = DAG.getMemIntrinsicNode(
32047 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
32048 Gather->getMemoryVT(), Gather->getMemOperand());
32049 Results.push_back(Res);
32050 Results.push_back(Res.getValue(1));
32051 return;
32052 }
32053 return;
32054 }
32055 case ISD::LOAD: {
32056 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
32057 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
32058 // cast since type legalization will try to use an i64 load.
32059 MVT VT = N->getSimpleValueType(0);
32060 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
32061 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32062 "Unexpected type action!");
32063 if (!ISD::isNON_EXTLoad(N))
32064 return;
32065 auto *Ld = cast<LoadSDNode>(N);
32066 if (Subtarget.hasSSE2()) {
32067 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
32068 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
32069 Ld->getPointerInfo(), Ld->getOriginalAlign(),
32070 Ld->getMemOperand()->getFlags());
32071 SDValue Chain = Res.getValue(1);
32072 MVT VecVT = MVT::getVectorVT(LdVT, 2);
32073 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
32074 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
32075 Res = DAG.getBitcast(WideVT, Res);
32076 Results.push_back(Res);
32077 Results.push_back(Chain);
32078 return;
32079 }
32080 assert(Subtarget.hasSSE1() && "Expected SSE");
32081 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
32082 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
32083 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
32084 MVT::i64, Ld->getMemOperand());
32085 Results.push_back(Res);
32086 Results.push_back(Res.getValue(1));
32087 return;
32088 }
32089 case ISD::ADDRSPACECAST: {
32090 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
32091 Results.push_back(V);
32092 return;
32093 }
32094 case ISD::BITREVERSE:
32095 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32096 assert(Subtarget.hasXOP() && "Expected XOP");
32097 // We can use VPPERM by copying to a vector register and back. We'll need
32098 // to move the scalar in two i32 pieces.
32099 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
32100 return;
32101 }
32102}
32103
32104const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
32105 switch ((X86ISD::NodeType)Opcode) {
32106 case X86ISD::FIRST_NUMBER: break;
32107#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
32108 NODE_NAME_CASE(BSF)
32109 NODE_NAME_CASE(BSR)
32110 NODE_NAME_CASE(FSHL)
32111 NODE_NAME_CASE(FSHR)
32112 NODE_NAME_CASE(FAND)
32113 NODE_NAME_CASE(FANDN)
32114 NODE_NAME_CASE(FOR)
32115 NODE_NAME_CASE(FXOR)
32116 NODE_NAME_CASE(FILD)
32117 NODE_NAME_CASE(FIST)
32118 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
32119 NODE_NAME_CASE(FLD)
32120 NODE_NAME_CASE(FST)
32121 NODE_NAME_CASE(CALL)
32122 NODE_NAME_CASE(CALL_RVMARKER)
32123 NODE_NAME_CASE(BT)
32124 NODE_NAME_CASE(CMP)
32125 NODE_NAME_CASE(FCMP)
32126 NODE_NAME_CASE(STRICT_FCMP)
32127 NODE_NAME_CASE(STRICT_FCMPS)
32128 NODE_NAME_CASE(COMI)
32129 NODE_NAME_CASE(UCOMI)
32130 NODE_NAME_CASE(CMPM)
32131 NODE_NAME_CASE(CMPMM)
32132 NODE_NAME_CASE(STRICT_CMPM)
32133 NODE_NAME_CASE(CMPMM_SAE)
32134 NODE_NAME_CASE(SETCC)
32135 NODE_NAME_CASE(SETCC_CARRY)
32136 NODE_NAME_CASE(FSETCC)
32137 NODE_NAME_CASE(FSETCCM)
32138 NODE_NAME_CASE(FSETCCM_SAE)
32139 NODE_NAME_CASE(CMOV)
32140 NODE_NAME_CASE(BRCOND)
32141 NODE_NAME_CASE(RET_FLAG)
32142 NODE_NAME_CASE(IRET)
32143 NODE_NAME_CASE(REP_STOS)
32144 NODE_NAME_CASE(REP_MOVS)
32145 NODE_NAME_CASE(GlobalBaseReg)
32146 NODE_NAME_CASE(Wrapper)
32147 NODE_NAME_CASE(WrapperRIP)
32148 NODE_NAME_CASE(MOVQ2DQ)
32149 NODE_NAME_CASE(MOVDQ2Q)
32150 NODE_NAME_CASE(MMX_MOVD2W)
32151 NODE_NAME_CASE(MMX_MOVW2D)
32152 NODE_NAME_CASE(PEXTRB)
32153 NODE_NAME_CASE(PEXTRW)
32154 NODE_NAME_CASE(INSERTPS)
32155 NODE_NAME_CASE(PINSRB)
32156 NODE_NAME_CASE(PINSRW)
32157 NODE_NAME_CASE(PSHUFB)
32158 NODE_NAME_CASE(ANDNP)
32159 NODE_NAME_CASE(BLENDI)
32160 NODE_NAME_CASE(BLENDV)
32161 NODE_NAME_CASE(HADD)
32162 NODE_NAME_CASE(HSUB)
32163 NODE_NAME_CASE(FHADD)
32164 NODE_NAME_CASE(FHSUB)
32165 NODE_NAME_CASE(CONFLICT)
32166 NODE_NAME_CASE(FMAX)
32167 NODE_NAME_CASE(FMAXS)
32168 NODE_NAME_CASE(FMAX_SAE)
32169 NODE_NAME_CASE(FMAXS_SAE)
32170 NODE_NAME_CASE(FMIN)
32171 NODE_NAME_CASE(FMINS)
32172 NODE_NAME_CASE(FMIN_SAE)
32173 NODE_NAME_CASE(FMINS_SAE)
32174 NODE_NAME_CASE(FMAXC)
32175 NODE_NAME_CASE(FMINC)
32176 NODE_NAME_CASE(FRSQRT)
32177 NODE_NAME_CASE(FRCP)
32178 NODE_NAME_CASE(EXTRQI)
32179 NODE_NAME_CASE(INSERTQI)
32180 NODE_NAME_CASE(TLSADDR)
32181 NODE_NAME_CASE(TLSBASEADDR)
32182 NODE_NAME_CASE(TLSCALL)
32183 NODE_NAME_CASE(EH_SJLJ_SETJMP)
32184 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
32185 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
32186 NODE_NAME_CASE(EH_RETURN)
32187 NODE_NAME_CASE(TC_RETURN)
32188 NODE_NAME_CASE(FNSTCW16m)
32189 NODE_NAME_CASE(FLDCW16m)
32190 NODE_NAME_CASE(LCMPXCHG_DAG)
32191 NODE_NAME_CASE(LCMPXCHG8_DAG)
32192 NODE_NAME_CASE(LCMPXCHG16_DAG)
32193 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
32194 NODE_NAME_CASE(LADD)
32195 NODE_NAME_CASE(LSUB)
32196 NODE_NAME_CASE(LOR)
32197 NODE_NAME_CASE(LXOR)
32198 NODE_NAME_CASE(LAND)
32199 NODE_NAME_CASE(VZEXT_MOVL)
32200 NODE_NAME_CASE(VZEXT_LOAD)
32201 NODE_NAME_CASE(VEXTRACT_STORE)
32202 NODE_NAME_CASE(VTRUNC)
32203 NODE_NAME_CASE(VTRUNCS)
32204 NODE_NAME_CASE(VTRUNCUS)
32205 NODE_NAME_CASE(VMTRUNC)
32206 NODE_NAME_CASE(VMTRUNCS)
32207 NODE_NAME_CASE(VMTRUNCUS)
32208 NODE_NAME_CASE(VTRUNCSTORES)
32209 NODE_NAME_CASE(VTRUNCSTOREUS)
32210 NODE_NAME_CASE(VMTRUNCSTORES)
32211 NODE_NAME_CASE(VMTRUNCSTOREUS)
32212 NODE_NAME_CASE(VFPEXT)
32213 NODE_NAME_CASE(STRICT_VFPEXT)
32214 NODE_NAME_CASE(VFPEXT_SAE)
32215 NODE_NAME_CASE(VFPEXTS)
32216 NODE_NAME_CASE(VFPEXTS_SAE)
32217 NODE_NAME_CASE(VFPROUND)
32218 NODE_NAME_CASE(STRICT_VFPROUND)
32219 NODE_NAME_CASE(VMFPROUND)
32220 NODE_NAME_CASE(VFPROUND_RND)
32221 NODE_NAME_CASE(VFPROUNDS)
32222 NODE_NAME_CASE(VFPROUNDS_RND)
32223 NODE_NAME_CASE(VSHLDQ)
32224 NODE_NAME_CASE(VSRLDQ)
32225 NODE_NAME_CASE(VSHL)
32226 NODE_NAME_CASE(VSRL)
32227 NODE_NAME_CASE(VSRA)
32228 NODE_NAME_CASE(VSHLI)
32229 NODE_NAME_CASE(VSRLI)
32230 NODE_NAME_CASE(VSRAI)
32231 NODE_NAME_CASE(VSHLV)
32232 NODE_NAME_CASE(VSRLV)
32233 NODE_NAME_CASE(VSRAV)
32234 NODE_NAME_CASE(VROTLI)
32235 NODE_NAME_CASE(VROTRI)
32236 NODE_NAME_CASE(VPPERM)
32237 NODE_NAME_CASE(CMPP)
32238 NODE_NAME_CASE(STRICT_CMPP)
32239 NODE_NAME_CASE(PCMPEQ)
32240 NODE_NAME_CASE(PCMPGT)
32241 NODE_NAME_CASE(PHMINPOS)
32242 NODE_NAME_CASE(ADD)
32243 NODE_NAME_CASE(SUB)
32244 NODE_NAME_CASE(ADC)
32245 NODE_NAME_CASE(SBB)
32246 NODE_NAME_CASE(SMUL)
32247 NODE_NAME_CASE(UMUL)
32248 NODE_NAME_CASE(OR)
32249 NODE_NAME_CASE(XOR)
32250 NODE_NAME_CASE(AND)
32251 NODE_NAME_CASE(BEXTR)
32252 NODE_NAME_CASE(BEXTRI)
32253 NODE_NAME_CASE(BZHI)
32254 NODE_NAME_CASE(PDEP)
32255 NODE_NAME_CASE(PEXT)
32256 NODE_NAME_CASE(MUL_IMM)
32257 NODE_NAME_CASE(MOVMSK)
32258 NODE_NAME_CASE(PTEST)
32259 NODE_NAME_CASE(TESTP)
32260 NODE_NAME_CASE(KORTEST)
32261 NODE_NAME_CASE(KTEST)
32262 NODE_NAME_CASE(KADD)
32263 NODE_NAME_CASE(KSHIFTL)
32264 NODE_NAME_CASE(KSHIFTR)
32265 NODE_NAME_CASE(PACKSS)
32266 NODE_NAME_CASE(PACKUS)
32267 NODE_NAME_CASE(PALIGNR)
32268 NODE_NAME_CASE(VALIGN)
32269 NODE_NAME_CASE(VSHLD)
32270 NODE_NAME_CASE(VSHRD)
32271 NODE_NAME_CASE(VSHLDV)
32272 NODE_NAME_CASE(VSHRDV)
32273 NODE_NAME_CASE(PSHUFD)
32274 NODE_NAME_CASE(PSHUFHW)
32275 NODE_NAME_CASE(PSHUFLW)
32276 NODE_NAME_CASE(SHUFP)
32277 NODE_NAME_CASE(SHUF128)
32278 NODE_NAME_CASE(MOVLHPS)
32279 NODE_NAME_CASE(MOVHLPS)
32280 NODE_NAME_CASE(MOVDDUP)
32281 NODE_NAME_CASE(MOVSHDUP)
32282 NODE_NAME_CASE(MOVSLDUP)
32283 NODE_NAME_CASE(MOVSD)
32284 NODE_NAME_CASE(MOVSS)
32285 NODE_NAME_CASE(MOVSH)
32286 NODE_NAME_CASE(UNPCKL)
32287 NODE_NAME_CASE(UNPCKH)
32288 NODE_NAME_CASE(VBROADCAST)
32289 NODE_NAME_CASE(VBROADCAST_LOAD)
32290 NODE_NAME_CASE(VBROADCASTM)
32291 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
32292 NODE_NAME_CASE(VPERMILPV)
32293 NODE_NAME_CASE(VPERMILPI)
32294 NODE_NAME_CASE(VPERM2X128)
32295 NODE_NAME_CASE(VPERMV)
32296 NODE_NAME_CASE(VPERMV3)
32297 NODE_NAME_CASE(VPERMI)
32298 NODE_NAME_CASE(VPTERNLOG)
32299 NODE_NAME_CASE(VFIXUPIMM)
32300 NODE_NAME_CASE(VFIXUPIMM_SAE)
32301 NODE_NAME_CASE(VFIXUPIMMS)
32302 NODE_NAME_CASE(VFIXUPIMMS_SAE)
32303 NODE_NAME_CASE(VRANGE)
32304 NODE_NAME_CASE(VRANGE_SAE)
32305 NODE_NAME_CASE(VRANGES)
32306 NODE_NAME_CASE(VRANGES_SAE)
32307 NODE_NAME_CASE(PMULUDQ)
32308 NODE_NAME_CASE(PMULDQ)
32309 NODE_NAME_CASE(PSADBW)
32310 NODE_NAME_CASE(DBPSADBW)
32311 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
32312 NODE_NAME_CASE(VAARG_64)
32313 NODE_NAME_CASE(VAARG_X32)
32314 NODE_NAME_CASE(WIN_ALLOCA)
32315 NODE_NAME_CASE(MEMBARRIER)
32316 NODE_NAME_CASE(MFENCE)
32317 NODE_NAME_CASE(SEG_ALLOCA)
32318 NODE_NAME_CASE(PROBED_ALLOCA)
32319 NODE_NAME_CASE(RDRAND)
32320 NODE_NAME_CASE(RDSEED)
32321 NODE_NAME_CASE(RDPKRU)
32322 NODE_NAME_CASE(WRPKRU)
32323 NODE_NAME_CASE(VPMADDUBSW)
32324 NODE_NAME_CASE(VPMADDWD)
32325 NODE_NAME_CASE(VPSHA)
32326 NODE_NAME_CASE(VPSHL)
32327 NODE_NAME_CASE(VPCOM)
32328 NODE_NAME_CASE(VPCOMU)
32329 NODE_NAME_CASE(VPERMIL2)
32330 NODE_NAME_CASE(FMSUB)
32331 NODE_NAME_CASE(STRICT_FMSUB)
32332 NODE_NAME_CASE(FNMADD)
32333 NODE_NAME_CASE(STRICT_FNMADD)
32334 NODE_NAME_CASE(FNMSUB)
32335 NODE_NAME_CASE(STRICT_FNMSUB)
32336 NODE_NAME_CASE(FMADDSUB)
32337 NODE_NAME_CASE(FMSUBADD)
32338 NODE_NAME_CASE(FMADD_RND)
32339 NODE_NAME_CASE(FNMADD_RND)
32340 NODE_NAME_CASE(FMSUB_RND)
32341 NODE_NAME_CASE(FNMSUB_RND)
32342 NODE_NAME_CASE(FMADDSUB_RND)
32343 NODE_NAME_CASE(FMSUBADD_RND)
32344 NODE_NAME_CASE(VPMADD52H)
32345 NODE_NAME_CASE(VPMADD52L)
32346 NODE_NAME_CASE(VRNDSCALE)
32347 NODE_NAME_CASE(STRICT_VRNDSCALE)
32348 NODE_NAME_CASE(VRNDSCALE_SAE)
32349 NODE_NAME_CASE(VRNDSCALES)
32350 NODE_NAME_CASE(VRNDSCALES_SAE)
32351 NODE_NAME_CASE(VREDUCE)
32352 NODE_NAME_CASE(VREDUCE_SAE)
32353 NODE_NAME_CASE(VREDUCES)
32354 NODE_NAME_CASE(VREDUCES_SAE)
32355 NODE_NAME_CASE(VGETMANT)
32356 NODE_NAME_CASE(VGETMANT_SAE)
32357 NODE_NAME_CASE(VGETMANTS)
32358 NODE_NAME_CASE(VGETMANTS_SAE)
32359 NODE_NAME_CASE(PCMPESTR)
32360 NODE_NAME_CASE(PCMPISTR)
32361 NODE_NAME_CASE(XTEST)
32362 NODE_NAME_CASE(COMPRESS)
32363 NODE_NAME_CASE(EXPAND)
32364 NODE_NAME_CASE(SELECTS)
32365 NODE_NAME_CASE(ADDSUB)
32366 NODE_NAME_CASE(RCP14)
32367 NODE_NAME_CASE(RCP14S)
32368 NODE_NAME_CASE(RCP28)
32369 NODE_NAME_CASE(RCP28_SAE)
32370 NODE_NAME_CASE(RCP28S)
32371 NODE_NAME_CASE(RCP28S_SAE)
32372 NODE_NAME_CASE(EXP2)
32373 NODE_NAME_CASE(EXP2_SAE)
32374 NODE_NAME_CASE(RSQRT14)
32375 NODE_NAME_CASE(RSQRT14S)
32376 NODE_NAME_CASE(RSQRT28)
32377 NODE_NAME_CASE(RSQRT28_SAE)
32378 NODE_NAME_CASE(RSQRT28S)
32379 NODE_NAME_CASE(RSQRT28S_SAE)
32380 NODE_NAME_CASE(FADD_RND)
32381 NODE_NAME_CASE(FADDS)
32382 NODE_NAME_CASE(FADDS_RND)
32383 NODE_NAME_CASE(FSUB_RND)
32384 NODE_NAME_CASE(FSUBS)
32385 NODE_NAME_CASE(FSUBS_RND)
32386 NODE_NAME_CASE(FMUL_RND)
32387 NODE_NAME_CASE(FMULS)
32388 NODE_NAME_CASE(FMULS_RND)
32389 NODE_NAME_CASE(FDIV_RND)
32390 NODE_NAME_CASE(FDIVS)
32391 NODE_NAME_CASE(FDIVS_RND)
32392 NODE_NAME_CASE(FSQRT_RND)
32393 NODE_NAME_CASE(FSQRTS)
32394 NODE_NAME_CASE(FSQRTS_RND)
32395 NODE_NAME_CASE(FGETEXP)
32396 NODE_NAME_CASE(FGETEXP_SAE)
32397 NODE_NAME_CASE(FGETEXPS)
32398 NODE_NAME_CASE(FGETEXPS_SAE)
32399 NODE_NAME_CASE(SCALEF)
32400 NODE_NAME_CASE(SCALEF_RND)
32401 NODE_NAME_CASE(SCALEFS)
32402 NODE_NAME_CASE(SCALEFS_RND)
32403 NODE_NAME_CASE(AVG)
32404 NODE_NAME_CASE(MULHRS)
32405 NODE_NAME_CASE(SINT_TO_FP_RND)
32406 NODE_NAME_CASE(UINT_TO_FP_RND)
32407 NODE_NAME_CASE(CVTTP2SI)
32408 NODE_NAME_CASE(CVTTP2UI)
32409 NODE_NAME_CASE(STRICT_CVTTP2SI)
32410 NODE_NAME_CASE(STRICT_CVTTP2UI)
32411 NODE_NAME_CASE(MCVTTP2SI)
32412 NODE_NAME_CASE(MCVTTP2UI)
32413 NODE_NAME_CASE(CVTTP2SI_SAE)
32414 NODE_NAME_CASE(CVTTP2UI_SAE)
32415 NODE_NAME_CASE(CVTTS2SI)
32416 NODE_NAME_CASE(CVTTS2UI)
32417 NODE_NAME_CASE(CVTTS2SI_SAE)
32418 NODE_NAME_CASE(CVTTS2UI_SAE)
32419 NODE_NAME_CASE(CVTSI2P)
32420 NODE_NAME_CASE(CVTUI2P)
32421 NODE_NAME_CASE(STRICT_CVTSI2P)
32422 NODE_NAME_CASE(STRICT_CVTUI2P)
32423 NODE_NAME_CASE(MCVTSI2P)
32424 NODE_NAME_CASE(MCVTUI2P)
32425 NODE_NAME_CASE(VFPCLASS)
32426 NODE_NAME_CASE(VFPCLASSS)
32427 NODE_NAME_CASE(MULTISHIFT)
32428 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
32429 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
32430 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
32431 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
32432 NODE_NAME_CASE(CVTPS2PH)
32433 NODE_NAME_CASE(STRICT_CVTPS2PH)
32434 NODE_NAME_CASE(MCVTPS2PH)
32435 NODE_NAME_CASE(CVTPH2PS)
32436 NODE_NAME_CASE(STRICT_CVTPH2PS)
32437 NODE_NAME_CASE(CVTPH2PS_SAE)
32438 NODE_NAME_CASE(CVTP2SI)
32439 NODE_NAME_CASE(CVTP2UI)
32440 NODE_NAME_CASE(MCVTP2SI)
32441 NODE_NAME_CASE(MCVTP2UI)
32442 NODE_NAME_CASE(CVTP2SI_RND)
32443 NODE_NAME_CASE(CVTP2UI_RND)
32444 NODE_NAME_CASE(CVTS2SI)
32445 NODE_NAME_CASE(CVTS2UI)
32446 NODE_NAME_CASE(CVTS2SI_RND)
32447 NODE_NAME_CASE(CVTS2UI_RND)
32448 NODE_NAME_CASE(CVTNE2PS2BF16)
32449 NODE_NAME_CASE(CVTNEPS2BF16)
32450 NODE_NAME_CASE(MCVTNEPS2BF16)
32451 NODE_NAME_CASE(DPBF16PS)
32452 NODE_NAME_CASE(LWPINS)
32453 NODE_NAME_CASE(MGATHER)
32454 NODE_NAME_CASE(MSCATTER)
32455 NODE_NAME_CASE(VPDPBUSD)
32456 NODE_NAME_CASE(VPDPBUSDS)
32457 NODE_NAME_CASE(VPDPWSSD)
32458 NODE_NAME_CASE(VPDPWSSDS)
32459 NODE_NAME_CASE(VPSHUFBITQMB)
32460 NODE_NAME_CASE(GF2P8MULB)
32461 NODE_NAME_CASE(GF2P8AFFINEQB)
32462 NODE_NAME_CASE(GF2P8AFFINEINVQB)
32463 NODE_NAME_CASE(NT_CALL)
32464 NODE_NAME_CASE(NT_BRIND)
32465 NODE_NAME_CASE(UMWAIT)
32466 NODE_NAME_CASE(TPAUSE)
32467 NODE_NAME_CASE(ENQCMD)
32468 NODE_NAME_CASE(ENQCMDS)
32469 NODE_NAME_CASE(VP2INTERSECT)
32470 NODE_NAME_CASE(AESENC128KL)
32471 NODE_NAME_CASE(AESDEC128KL)
32472 NODE_NAME_CASE(AESENC256KL)
32473 NODE_NAME_CASE(AESDEC256KL)
32474 NODE_NAME_CASE(AESENCWIDE128KL)
32475 NODE_NAME_CASE(AESDECWIDE128KL)
32476 NODE_NAME_CASE(AESENCWIDE256KL)
32477 NODE_NAME_CASE(AESDECWIDE256KL)
32478 NODE_NAME_CASE(TESTUI)
32479 }
32480 return nullptr;
32481#undef NODE_NAME_CASE
32482}
32483
32484/// Return true if the addressing mode represented by AM is legal for this
32485/// target, for a load/store of the specified type.
32486bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
32487 const AddrMode &AM, Type *Ty,
32488 unsigned AS,
32489 Instruction *I) const {
32490 // X86 supports extremely general addressing modes.
32491 CodeModel::Model M = getTargetMachine().getCodeModel();
32492
32493 // X86 allows a sign-extended 32-bit immediate field as a displacement.
32494 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
32495 return false;
32496
32497 if (AM.BaseGV) {
32498 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
32499
32500 // If a reference to this global requires an extra load, we can't fold it.
32501 if (isGlobalStubReference(GVFlags))
32502 return false;
32503
32504 // If BaseGV requires a register for the PIC base, we cannot also have a
32505 // BaseReg specified.
32506 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
32507 return false;
32508
32509 // If lower 4G is not available, then we must use rip-relative addressing.
32510 if ((M != CodeModel::Small || isPositionIndependent()) &&
32511 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
32512 return false;
32513 }
32514
32515 switch (AM.Scale) {
32516 case 0:
32517 case 1:
32518 case 2:
32519 case 4:
32520 case 8:
32521 // These scales always work.
32522 break;
32523 case 3:
32524 case 5:
32525 case 9:
32526 // These scales are formed with basereg+scalereg. Only accept if there is
32527 // no basereg yet.
32528 if (AM.HasBaseReg)
32529 return false;
32530 break;
32531 default: // Other stuff never works.
32532 return false;
32533 }
32534
32535 return true;
32536}
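// Illustrative note (not part of the original source): the modes accepted
// above all fit the generic x86 addressing form
//   [BaseReg + IndexReg * Scale + Disp32]
// A scale of 3, 5 or 9 can only be produced as reg + reg*2/4/8 with the same
// register in both slots, which consumes the base-register slot; that is why
// those scales are rejected once a base register is already present.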
32537
32538bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
32539 unsigned Bits = Ty->getScalarSizeInBits();
32540
32541 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
32542 // particularly cheaper than those without.
32543 if (Bits == 8)
32544 return false;
32545
32546 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
32547 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
32548 if (Subtarget.hasXOP() &&
32549 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
32550 return false;
32551
32552 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
32553 // shifts just as cheap as scalar ones.
32554 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
32555 return false;
32556
32557 // AVX512BW has shifts such as vpsllvw.
32558 if (Subtarget.hasBWI() && Bits == 16)
32559 return false;
32560
32561 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
32562 // fully general vector.
32563 return true;
32564}
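// Illustrative note (not part of the original source): the "scalar amount"
// this hook prefers is a splatted shift count, e.g.
//   %r = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
// which can select to a single PSLLD, while a non-uniform count needs a
// variable-shift instruction (AVX2/XOP/AVX-512) or a split sequence.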
32565
32566bool X86TargetLowering::isBinOp(unsigned Opcode) const {
32567 switch (Opcode) {
32568 // These are non-commutative binops.
32569 // TODO: Add more X86ISD opcodes once we have test coverage.
32570 case X86ISD::ANDNP:
32571 case X86ISD::PCMPGT:
32572 case X86ISD::FMAX:
32573 case X86ISD::FMIN:
32574 case X86ISD::FANDN:
32575 return true;
32576 }
32577
32578 return TargetLoweringBase::isBinOp(Opcode);
32579}
32580
32581bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
32582 switch (Opcode) {
32583 // TODO: Add more X86ISD opcodes once we have test coverage.
32584 case X86ISD::PCMPEQ:
32585 case X86ISD::PMULDQ:
32586 case X86ISD::PMULUDQ:
32587 case X86ISD::FMAXC:
32588 case X86ISD::FMINC:
32589 case X86ISD::FAND:
32590 case X86ISD::FOR:
32591 case X86ISD::FXOR:
32592 return true;
32593 }
32594
32595 return TargetLoweringBase::isCommutativeBinOp(Opcode);
32596}
32597
32598bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
32599 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
32600 return false;
32601 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
32602 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
32603 return NumBits1 > NumBits2;
32604}
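// Illustrative note (not part of the original source): on x86 truncating a
// wider integer is just a use of the narrower sub-register (e.g. reading EAX
// after computing a value in RAX), so any NumBits1 > NumBits2 pair is free.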
32605
32606bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
32607 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
32608 return false;
32609
32610 if (!isTypeLegal(EVT::getEVT(Ty1)))
32611 return false;
32612
32613 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
32614
32615 // Assuming the caller doesn't have a zeroext or signext return parameter,
32616 // truncation all the way down to i1 is valid.
32617 return true;
32618}
32619
32620bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
32621 return isInt<32>(Imm);
32622}
32623
32624bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
32625 // Can also use sub to handle negated immediates.
32626 return isInt<32>(Imm);
32627}
32628
32629bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
32630 return isInt<32>(Imm);
32631}
32632
32633bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
32634 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
32635 return false;
32636 unsigned NumBits1 = VT1.getSizeInBits();
32637 unsigned NumBits2 = VT2.getSizeInBits();
32638 return NumBits1 > NumBits2;
32639}
32640
32641bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
32642 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32643 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
32644}
32645
32646bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
32647 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
32648 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
32649}
32650
32651bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
32652 EVT VT1 = Val.getValueType();
32653 if (isZExtFree(VT1, VT2))
32654 return true;
32655
32656 if (Val.getOpcode() != ISD::LOAD)
32657 return false;
32658
32659 if (!VT1.isSimple() || !VT1.isInteger() ||
32660 !VT2.isSimple() || !VT2.isInteger())
32661 return false;
32662
32663 switch (VT1.getSimpleVT().SimpleTy) {
32664 default: break;
32665 case MVT::i8:
32666 case MVT::i16:
32667 case MVT::i32:
32668 // X86 has 8, 16, and 32-bit zero-extending loads.
32669 return true;
32670 }
32671
32672 return false;
32673}
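// Illustrative note (not part of the original source): the loads accepted
// above correspond to MOVZX (movzbl/movzwl) or a plain 32-bit MOV, all of
// which zero the upper bits of the destination register for free.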
32674
32675bool X86TargetLowering::shouldSinkOperands(Instruction *I,
32676 SmallVectorImpl<Use *> &Ops) const {
32677 using namespace llvm::PatternMatch;
32678
32679 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
32680 if (!VTy)
32681 return false;
32682
32683 if (I->getOpcode() == Instruction::Mul &&
32684 VTy->getElementType()->isIntegerTy(64)) {
32685 for (auto &Op : I->operands()) {
32686 // Make sure we are not already sinking this operand
32687 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
32688 continue;
32689
32690 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
32691 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
32692 if (Subtarget.hasSSE41() &&
32693 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
32694 m_SpecificInt(32)))) {
32695 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
32696 Ops.push_back(&Op);
32697 } else if (Subtarget.hasSSE2() &&
32698 match(Op.get(),
32699 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
32700 Ops.push_back(&Op);
32701 }
32702 }
32703
32704 return !Ops.empty();
32705 }
32706
32707 // A uniform shift amount in a vector shift or funnel shift may be much
32708 // cheaper than a generic variable vector shift, so make that pattern visible
32709 // to SDAG by sinking the shuffle instruction next to the shift.
32710 int ShiftAmountOpNum = -1;
32711 if (I->isShift())
32712 ShiftAmountOpNum = 1;
32713 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
32714 if (II->getIntrinsicID() == Intrinsic::fshl ||
32715 II->getIntrinsicID() == Intrinsic::fshr)
32716 ShiftAmountOpNum = 2;
32717 }
32718
32719 if (ShiftAmountOpNum == -1)
32720 return false;
32721
32722 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
32723 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
32724 isVectorShiftByScalarCheap(I->getType())) {
32725 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
32726 return true;
32727 }
32728
32729 return false;
32730}
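// Illustrative sketch (not from the original source; value names are made up):
// the PMULDQ pattern sunk above looks like
//   %s = shl <2 x i64> %x, <i64 32, i64 32>
//   %a = ashr <2 x i64> %s, <i64 32, i64 32>   ; sext_inreg from i32
//   %m = mul <2 x i64> %a, %y
// Keeping the shl/ashr (or the 0xffffffff mask for the unsigned case) next to
// the multiply lets instruction selection form PMULDQ/PMULUDQ instead of a
// general 64-bit vector multiply.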
32731
32732bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
32733 if (!Subtarget.is64Bit())
32734 return false;
32735 return TargetLowering::shouldConvertPhiType(From, To);
32736}
32737
32738bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
32739 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
32740 return false;
32741
32742 EVT SrcVT = ExtVal.getOperand(0).getValueType();
32743
32744 // There is no extending load for vXi1.
32745 if (SrcVT.getScalarType() == MVT::i1)
32746 return false;
32747
32748 return true;
32749}
32750
32751bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
32752 EVT VT) const {
32753 if (!Subtarget.hasAnyFMA())
32754 return false;
32755
32756 VT = VT.getScalarType();
32757
32758 if (!VT.isSimple())
32759 return false;
32760
32761 switch (VT.getSimpleVT().SimpleTy) {
32762 case MVT::f16:
32763 return Subtarget.hasFP16();
32764 case MVT::f32:
32765 case MVT::f64:
32766 return true;
32767 default:
32768 break;
32769 }
32770
32771 return false;
32772}
32773
32774bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
32775 // i16 instructions are longer (0x66 prefix) and potentially slower.
32776 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
32777}
32778
32779/// Targets can use this to indicate that they only support *some*
32780/// VECTOR_SHUFFLE operations, those with specific masks.
32781/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
32782/// are assumed to be legal.
32783bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
32784 if (!VT.isSimple())
32785 return false;
32786
32787 // Not for i1 vectors
32788 if (VT.getSimpleVT().getScalarType() == MVT::i1)
32789 return false;
32790
32791 // Very little shuffling can be done for 64-bit vectors right now.
32792 if (VT.getSimpleVT().getSizeInBits() == 64)
32793 return false;
32794
32795 // We only care that the types being shuffled are legal. The lowering can
32796 // handle any possible shuffle mask that results.
32797 return isTypeLegal(VT.getSimpleVT());
32798}
32799
32800bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
32801 EVT VT) const {
32802 // Don't convert an 'and' into a shuffle that we don't directly support.
32803 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
32804 if (!Subtarget.hasAVX2())
32805 if (VT == MVT::v32i8 || VT == MVT::v16i16)
32806 return false;
32807
32808 // Just delegate to the generic legality, clear masks aren't special.
32809 return isShuffleMaskLegal(Mask, VT);
32810}
32811
32812bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
32813 // If the subtarget is using thunks, we must not generate jump tables.
32814 if (Subtarget.useIndirectThunkBranches())
32815 return false;
32816
32817 // Otherwise, fall back on the generic logic.
32818 return TargetLowering::areJTsAllowed(Fn);
32819}
32820
32821//===----------------------------------------------------------------------===//
32822// X86 Scheduler Hooks
32823//===----------------------------------------------------------------------===//
32824
32825 // Returns true if EFLAGS is consumed after this iterator in the rest of the
32826// basic block or any successors of the basic block.
32827static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
32828 MachineBasicBlock *BB) {
32829 // Scan forward through BB for a use/def of EFLAGS.
32830 for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
32831 miI != miE; ++miI) {
32832 const MachineInstr& mi = *miI;
32833 if (mi.readsRegister(X86::EFLAGS))
32834 return true;
32835 // If we found a def, we can stop searching.
32836 if (mi.definesRegister(X86::EFLAGS))
32837 return false;
32838 }
32839
32840 // If we hit the end of the block, check whether EFLAGS is live into a
32841 // successor.
32842 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
32843 sEnd = BB->succ_end();
32844 sItr != sEnd; ++sItr) {
32845 MachineBasicBlock* succ = *sItr;
32846 if (succ->isLiveIn(X86::EFLAGS))
32847 return true;
32848 }
32849
32850 return false;
32851}
32852
32853/// Utility function to emit xbegin specifying the start of an RTM region.
32854static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
32855 const TargetInstrInfo *TII) {
32856 const DebugLoc &DL = MI.getDebugLoc();
32857
32858 const BasicBlock *BB = MBB->getBasicBlock();
32859 MachineFunction::iterator I = ++MBB->getIterator();
32860
32861 // For the v = xbegin(), we generate
32862 //
32863 // thisMBB:
32864 // xbegin sinkMBB
32865 //
32866 // mainMBB:
32867 // s0 = -1
32868 //
32869 // fallBB:
32870 // eax = # XABORT_DEF
32871 // s1 = eax
32872 //
32873 // sinkMBB:
32874 // v = phi(s0/mainBB, s1/fallBB)
32875
32876 MachineBasicBlock *thisMBB = MBB;
32877 MachineFunction *MF = MBB->getParent();
32878 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
32879 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32880 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32881 MF->insert(I, mainMBB);
32882 MF->insert(I, fallMBB);
32883 MF->insert(I, sinkMBB);
32884
32885 if (isEFLAGSLiveAfter(MI, MBB)) {
32886 mainMBB->addLiveIn(X86::EFLAGS);
32887 fallMBB->addLiveIn(X86::EFLAGS);
32888 sinkMBB->addLiveIn(X86::EFLAGS);
32889 }
32890
32891 // Transfer the remainder of BB and its successor edges to sinkMBB.
32892 sinkMBB->splice(sinkMBB->begin(), MBB,
32893 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32894 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32895
32896 MachineRegisterInfo &MRI = MF->getRegInfo();
32897 Register DstReg = MI.getOperand(0).getReg();
32898 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
32899 Register mainDstReg = MRI.createVirtualRegister(RC);
32900 Register fallDstReg = MRI.createVirtualRegister(RC);
32901
32902 // thisMBB:
32903 // xbegin fallMBB
32904 // # fallthrough to mainMBB
32905 // # on abort, jump to fallMBB
32906 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
32907 thisMBB->addSuccessor(mainMBB);
32908 thisMBB->addSuccessor(fallMBB);
32909
32910 // mainMBB:
32911 // mainDstReg := -1
32912 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
32913 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32914 mainMBB->addSuccessor(sinkMBB);
32915
32916 // fallMBB:
32917 // ; pseudo instruction to model hardware's definition from XABORT
32918 // EAX := XABORT_DEF
32919 // fallDstReg := EAX
32920 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
32921 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
32922 .addReg(X86::EAX);
32923 fallMBB->addSuccessor(sinkMBB);
32924
32925 // sinkMBB:
32926 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
32927 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
32928 .addReg(mainDstReg).addMBB(mainMBB)
32929 .addReg(fallDstReg).addMBB(fallMBB);
32930
32931 MI.eraseFromParent();
32932 return sinkMBB;
32933}
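// Illustrative note (not part of the original source): XBEGIN falls through to
// the transactional path when the transaction starts; on abort the CPU resumes
// at the fallback label with an abort status in EAX, which the XABORT_DEF
// pseudo above models as an implicit definition of EAX in fallMBB.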
32934
32935MachineBasicBlock *
32936X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
32937 MachineBasicBlock *MBB) const {
32938 // Emit va_arg instruction on X86-64.
32939
32940 // Operands to this pseudo-instruction:
32941 // 0 ) Output : destination address (reg)
32942 // 1-5) Input : va_list address (addr, i64mem)
32943 // 6 ) ArgSize : Size (in bytes) of vararg type
32944 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
32945 // 8 ) Align : Alignment of type
32946 // 9 ) EFLAGS (implicit-def)
32947
32948 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
32949 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
32950
32951 Register DestReg = MI.getOperand(0).getReg();
32952 MachineOperand &Base = MI.getOperand(1);
32953 MachineOperand &Scale = MI.getOperand(2);
32954 MachineOperand &Index = MI.getOperand(3);
32955 MachineOperand &Disp = MI.getOperand(4);
32956 MachineOperand &Segment = MI.getOperand(5);
32957 unsigned ArgSize = MI.getOperand(6).getImm();
32958 unsigned ArgMode = MI.getOperand(7).getImm();
32959 Align Alignment = Align(MI.getOperand(8).getImm());
32960
32961 MachineFunction *MF = MBB->getParent();
32962
32963 // Memory Reference
32964 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
32965
32966 MachineMemOperand *OldMMO = MI.memoperands().front();
32967
32968 // Clone the MMO into two separate MMOs for loading and storing
32969 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
32970 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
32971 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
32972 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
32973
32974 // Machine Information
32975 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32976 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
32977 const TargetRegisterClass *AddrRegClass =
32978 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
32979 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
32980 const DebugLoc &DL = MI.getDebugLoc();
32981
32982 // struct va_list {
32983 // i32 gp_offset
32984 // i32 fp_offset
32985 // i64 overflow_area (address)
32986 // i64 reg_save_area (address)
32987 // }
32988 // sizeof(va_list) = 24
32989 // alignment(va_list) = 8
32990
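// For reference (not part of the original source), the SysV x86-64 va_list is
// declared roughly as
//   typedef struct {
//     unsigned int gp_offset;   // byte offset 0
//     unsigned int fp_offset;   // byte offset 4
//     void *overflow_arg_area;  // byte offset 8
//     void *reg_save_area;      // byte offset 16
//   } va_list[1];
// which is where the 0/4/8/16 displacements used below come from (the X32 ABI
// uses 12 for reg_save_area because its pointers are 4 bytes).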
32991 unsigned TotalNumIntRegs = 6;
32992 unsigned TotalNumXMMRegs = 8;
32993 bool UseGPOffset = (ArgMode == 1);
32994 bool UseFPOffset = (ArgMode == 2);
32995 unsigned MaxOffset = TotalNumIntRegs * 8 +
32996 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
32997
32998 /* Align ArgSize to a multiple of 8 */
32999 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
33000 bool NeedsAlign = (Alignment > 8);
33001
33002 MachineBasicBlock *thisMBB = MBB;
33003 MachineBasicBlock *overflowMBB;
33004 MachineBasicBlock *offsetMBB;
33005 MachineBasicBlock *endMBB;
33006
33007 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
33008 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
33009 unsigned OffsetReg = 0;
33010
33011 if (!UseGPOffset && !UseFPOffset) {
33012 // If we only pull from the overflow region, we don't create a branch.
33013 // We don't need to alter control flow.
33014 OffsetDestReg = 0; // unused
33015 OverflowDestReg = DestReg;
33016
33017 offsetMBB = nullptr;
33018 overflowMBB = thisMBB;
33019 endMBB = thisMBB;
33020 } else {
33021 // First emit code to check if gp_offset (or fp_offset) is below the bound.
33022 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
33023 // If not, pull from overflow_area. (branch to overflowMBB)
33024 //
33025 // thisMBB
33026 // | .
33027 // | .
33028 // offsetMBB overflowMBB
33029 // | .
33030 // | .
33031 // endMBB
33032
33033 // Registers for the PHI in endMBB
33034 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
33035 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
33036
33037 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33038 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33039 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33040 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33041
33042 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33043
33044 // Insert the new basic blocks
33045 MF->insert(MBBIter, offsetMBB);
33046 MF->insert(MBBIter, overflowMBB);
33047 MF->insert(MBBIter, endMBB);
33048
33049 // Transfer the remainder of MBB and its successor edges to endMBB.
33050 endMBB->splice(endMBB->begin(), thisMBB,
33051 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
33052 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
33053
33054 // Make offsetMBB and overflowMBB successors of thisMBB
33055 thisMBB->addSuccessor(offsetMBB);
33056 thisMBB->addSuccessor(overflowMBB);
33057
33058 // endMBB is a successor of both offsetMBB and overflowMBB
33059 offsetMBB->addSuccessor(endMBB);
33060 overflowMBB->addSuccessor(endMBB);
33061
33062 // Load the offset value into a register
33063 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33064 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
33065 .add(Base)
33066 .add(Scale)
33067 .add(Index)
33068 .addDisp(Disp, UseFPOffset ? 4 : 0)
33069 .add(Segment)
33070 .setMemRefs(LoadOnlyMMO);
33071
33072 // Check if there is enough room left to pull this argument.
33073 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
33074 .addReg(OffsetReg)
33075 .addImm(MaxOffset + 8 - ArgSizeA8);
33076
33077 // Branch to "overflowMBB" if offset >= max
33078 // Fall through to "offsetMBB" otherwise
33079 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
33080 .addMBB(overflowMBB).addImm(X86::COND_AE);
33081 }
33082
33083 // In offsetMBB, emit code to use the reg_save_area.
33084 if (offsetMBB) {
33085 assert(OffsetReg != 0);
33086
33087 // Read the reg_save_area address.
33088 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
33089 BuildMI(
33090 offsetMBB, DL,
33091 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33092 RegSaveReg)
33093 .add(Base)
33094 .add(Scale)
33095 .add(Index)
33096 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
33097 .add(Segment)
33098 .setMemRefs(LoadOnlyMMO);
33099
33100 if (Subtarget.isTarget64BitLP64()) {
33101 // Zero-extend the offset
33102 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
33103 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
33104 .addImm(0)
33105 .addReg(OffsetReg)
33106 .addImm(X86::sub_32bit);
33107
33108 // Add the offset to the reg_save_area to get the final address.
33109 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
33110 .addReg(OffsetReg64)
33111 .addReg(RegSaveReg);
33112 } else {
33113 // Add the offset to the reg_save_area to get the final address.
33114 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
33115 .addReg(OffsetReg)
33116 .addReg(RegSaveReg);
33117 }
33118
33119 // Compute the offset for the next argument
33120 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
33121 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
33122 .addReg(OffsetReg)
33123 .addImm(UseFPOffset ? 16 : 8);
33124
33125 // Store it back into the va_list.
33126 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
33127 .add(Base)
33128 .add(Scale)
33129 .add(Index)
33130 .addDisp(Disp, UseFPOffset ? 4 : 0)
33131 .add(Segment)
33132 .addReg(NextOffsetReg)
33133 .setMemRefs(StoreOnlyMMO);
33134
33135 // Jump to endMBB
33136 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
33137 .addMBB(endMBB);
33138 }
33139
33140 //
33141 // Emit code to use overflow area
33142 //
33143
33144 // Load the overflow_area address into a register.
33145 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
33146 BuildMI(overflowMBB, DL,
33147 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
33148 OverflowAddrReg)
33149 .add(Base)
33150 .add(Scale)
33151 .add(Index)
33152 .addDisp(Disp, 8)
33153 .add(Segment)
33154 .setMemRefs(LoadOnlyMMO);
33155
33156 // If we need to align it, do so. Otherwise, just copy the address
33157 // to OverflowDestReg.
33158 if (NeedsAlign) {
33159 // Align the overflow address
33160 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
33161
33162 // aligned_addr = (addr + (align-1)) & ~(align-1)
33163 BuildMI(
33164 overflowMBB, DL,
33165 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
33166 TmpReg)
33167 .addReg(OverflowAddrReg)
33168 .addImm(Alignment.value() - 1);
33169
33170 BuildMI(
33171 overflowMBB, DL,
33172 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
33173 OverflowDestReg)
33174 .addReg(TmpReg)
33175 .addImm(~(uint64_t)(Alignment.value() - 1));
33176 } else {
33177 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
33178 .addReg(OverflowAddrReg);
33179 }
33180
33181 // Compute the next overflow address after this argument.
33182 // (the overflow address should be kept 8-byte aligned)
33183 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
33184 BuildMI(
33185 overflowMBB, DL,
33186 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
33187 NextAddrReg)
33188 .addReg(OverflowDestReg)
33189 .addImm(ArgSizeA8);
33190
33191 // Store the new overflow address.
33192 BuildMI(overflowMBB, DL,
33193 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
33194 .add(Base)
33195 .add(Scale)
33196 .add(Index)
33197 .addDisp(Disp, 8)
33198 .add(Segment)
33199 .addReg(NextAddrReg)
33200 .setMemRefs(StoreOnlyMMO);
33201
33202 // If we branched, emit the PHI to the front of endMBB.
33203 if (offsetMBB) {
33204 BuildMI(*endMBB, endMBB->begin(), DL,
33205 TII->get(X86::PHI), DestReg)
33206 .addReg(OffsetDestReg).addMBB(offsetMBB)
33207 .addReg(OverflowDestReg).addMBB(overflowMBB);
33208 }
33209
33210 // Erase the pseudo instruction
33211 MI.eraseFromParent();
33212
33213 return endMBB;
33214}
33215
33216// The EFLAGS operand of SelectItr might be missing a kill marker
33217// because there were multiple uses of EFLAGS, and ISel didn't know
33218// which to mark. Figure out whether SelectItr should have had a
33219// kill marker, and set it if it should. Returns the correct kill
33220// marker value.
33221static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
33222 MachineBasicBlock* BB,
33223 const TargetRegisterInfo* TRI) {
33224 if (isEFLAGSLiveAfter(SelectItr, BB))
33225 return false;
33226
33227 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
33228 // out. SelectMI should have a kill flag on EFLAGS.
33229 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
33230 return true;
33231}
33232
33233// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
33234 // together with other CMOV pseudo-opcodes into a single basic block with
33235 // a conditional jump around it.
33236static bool isCMOVPseudo(MachineInstr &MI) {
33237 switch (MI.getOpcode()) {
33238 case X86::CMOV_FR16X:
33239 case X86::CMOV_FR32:
33240 case X86::CMOV_FR32X:
33241 case X86::CMOV_FR64:
33242 case X86::CMOV_FR64X:
33243 case X86::CMOV_GR8:
33244 case X86::CMOV_GR16:
33245 case X86::CMOV_GR32:
33246 case X86::CMOV_RFP32:
33247 case X86::CMOV_RFP64:
33248 case X86::CMOV_RFP80:
33249 case X86::CMOV_VR64:
33250 case X86::CMOV_VR128:
33251 case X86::CMOV_VR128X:
33252 case X86::CMOV_VR256:
33253 case X86::CMOV_VR256X:
33254 case X86::CMOV_VR512:
33255 case X86::CMOV_VK1:
33256 case X86::CMOV_VK2:
33257 case X86::CMOV_VK4:
33258 case X86::CMOV_VK8:
33259 case X86::CMOV_VK16:
33260 case X86::CMOV_VK32:
33261 case X86::CMOV_VK64:
33262 return true;
33263
33264 default:
33265 return false;
33266 }
33267}
33268
33269// Helper function, which inserts PHI functions into SinkMBB:
33270// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
33271 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
33272 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
33273// the last PHI function inserted.
33274static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
33275 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
33276 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
33277 MachineBasicBlock *SinkMBB) {
33278 MachineFunction *MF = TrueMBB->getParent();
33279 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
33280 const DebugLoc &DL = MIItBegin->getDebugLoc();
33281
33282 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
33283 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
33284
33285 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
33286
33287 // As we are creating the PHIs, we have to be careful if there is more than
33288 // one. Later CMOVs may reference the results of earlier CMOVs, but later
33289 // PHIs have to reference the individual true/false inputs from earlier PHIs.
33290 // That also means that PHI construction must work forward from earlier to
33291 // later, and that the code must maintain a mapping from each earlier PHI's
33292 // destination register to the registers that went into that PHI.
33293 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
33294 MachineInstrBuilder MIB;
33295
33296 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
33297 Register DestReg = MIIt->getOperand(0).getReg();
33298 Register Op1Reg = MIIt->getOperand(1).getReg();
33299 Register Op2Reg = MIIt->getOperand(2).getReg();
33300
33301 // If this CMOV we are generating is the opposite condition from
33302 // the jump we generated, then we have to swap the operands for the
33303 // PHI that is going to be generated.
33304 if (MIIt->getOperand(3).getImm() == OppCC)
33305 std::swap(Op1Reg, Op2Reg);
33306
33307 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
33308 Op1Reg = RegRewriteTable[Op1Reg].first;
33309
33310 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
33311 Op2Reg = RegRewriteTable[Op2Reg].second;
33312
33313 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
33314 .addReg(Op1Reg)
33315 .addMBB(FalseMBB)
33316 .addReg(Op2Reg)
33317 .addMBB(TrueMBB);
33318
33319 // Add this PHI to the rewrite table.
33320 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
33321 }
33322
33323 return MIB;
33324}
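
The register renaming performed above can be modelled outside the backend. The following standalone sketch (an illustration only, not part of this file; the register numbers and the std::map stand-in for RegRewriteTable are made up) shows how a CMOV that reads an earlier CMOV's result is rewritten to read that CMOV's original true/false inputs instead:

#include <cstdio>
#include <map>
#include <utility>

int main() {
  // Each entry: PHI dest -> (value used on the FalseMBB edge, value used on the TrueMBB edge).
  std::map<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  struct Cmov { unsigned Dest, Op1, Op2; };
  Cmov Seq[] = {{10, 1, 2},   // %10 = CMOV %1, %2
                {11, 10, 3}}; // %11 = CMOV %10, %3  (reads the previous CMOV's result)

  for (const Cmov &C : Seq) {
    unsigned Op1 = C.Op1, Op2 = C.Op2;
    auto It1 = RegRewriteTable.find(Op1);
    if (It1 != RegRewriteTable.end())
      Op1 = It1->second.first;   // take the value %Op1 had on the false edge
    auto It2 = RegRewriteTable.find(Op2);
    if (It2 != RegRewriteTable.end())
      Op2 = It2->second.second;  // take the value %Op2 had on the true edge
    std::printf("%%%u = PHI [%%%u, FalseMBB], [%%%u, TrueMBB]\n", C.Dest, Op1, Op2);
    RegRewriteTable[C.Dest] = {Op1, Op2};
  }
  return 0;
}

Running it prints the renamed PHIs, matching the t2/t3 renaming example discussed in EmitLoweredSelect below.
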
33325
33326// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
33327MachineBasicBlock *
33328X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
33329 MachineInstr &SecondCascadedCMOV,
33330 MachineBasicBlock *ThisMBB) const {
33331 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33332 const DebugLoc &DL = FirstCMOV.getDebugLoc();
33333
33334 // We lower cascaded CMOVs such as
33335 //
33336 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
33337 //
33338 // to two successive branches.
33339 //
33340 // Without this, we would add a PHI between the two jumps, which ends up
33341 // creating a few copies all around. For instance, for
33342 //
33343 // (sitofp (zext (fcmp une)))
33344 //
33345 // we would generate:
33346 //
33347 // ucomiss %xmm1, %xmm0
33348 // movss <1.0f>, %xmm0
33349 // movaps %xmm0, %xmm1
33350 // jne .LBB5_2
33351 // xorps %xmm1, %xmm1
33352 // .LBB5_2:
33353 // jp .LBB5_4
33354 // movaps %xmm1, %xmm0
33355 // .LBB5_4:
33356 // retq
33357 //
33358 // because this custom-inserter would have generated:
33359 //
33360 // A
33361 // | \
33362 // | B
33363 // | /
33364 // C
33365 // | \
33366 // | D
33367 // | /
33368 // E
33369 //
33370 // A: X = ...; Y = ...
33371 // B: empty
33372 // C: Z = PHI [X, A], [Y, B]
33373 // D: empty
33374 // E: PHI [X, C], [Z, D]
33375 //
33376 // If we lower both CMOVs in a single step, we can instead generate:
33377 //
33378 // A
33379 // | \
33380 // | C
33381 // | /|
33382 // |/ |
33383 // | |
33384 // | D
33385 // | /
33386 // E
33387 //
33388 // A: X = ...; Y = ...
33389 // D: empty
33390 // E: PHI [X, A], [X, C], [Y, D]
33391 //
33392 // Which, in our sitofp/fcmp example, gives us something like:
33393 //
33394 // ucomiss %xmm1, %xmm0
33395 // movss <1.0f>, %xmm0
33396 // jne .LBB5_4
33397 // jp .LBB5_4
33398 // xorps %xmm0, %xmm0
33399 // .LBB5_4:
33400 // retq
33401 //
33402
33403 // We lower cascaded CMOV into two successive branches to the same block.
33404 // EFLAGS is used by both, so mark it as live in the second.
33405 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
33406 MachineFunction *F = ThisMBB->getParent();
33407 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
33408 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
33409 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
33410
33411 MachineFunction::iterator It = ++ThisMBB->getIterator();
33412 F->insert(It, FirstInsertedMBB);
33413 F->insert(It, SecondInsertedMBB);
33414 F->insert(It, SinkMBB);
33415
33416 // For a cascaded CMOV, we lower it to two successive branches to
33417 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
33418 // the FirstInsertedMBB.
33419 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
33420
33421 // If the EFLAGS register isn't dead in the terminator, then claim that it's
33422 // live into the sink and copy blocks.
33423 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33424 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
33425 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
33426 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
33427 SinkMBB->addLiveIn(X86::EFLAGS);
33428 }
33429
33430 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
33431 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
33432 std::next(MachineBasicBlock::iterator(FirstCMOV)),
33433 ThisMBB->end());
33434 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
33435
33436 // Fallthrough block for ThisMBB.
33437 ThisMBB->addSuccessor(FirstInsertedMBB);
33438 // The true block target of the first branch is always SinkMBB.
33439 ThisMBB->addSuccessor(SinkMBB);
33440 // Fallthrough block for FirstInsertedMBB.
33441 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
33442 // The true block for the branch of FirstInsertedMBB.
33443 FirstInsertedMBB->addSuccessor(SinkMBB);
33444 // This is fallthrough.
33445 SecondInsertedMBB->addSuccessor(SinkMBB);
33446
33447 // Create the conditional branch instructions.
33448 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
33449 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
33450
33451 X86::CondCode SecondCC =
33452 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
33453 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
33454
33455 // SinkMBB:
33456 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
33457 Register DestReg = FirstCMOV.getOperand(0).getReg();
33458 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
33459 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
33460 MachineInstrBuilder MIB =
33461 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
33462 .addReg(Op1Reg)
33463 .addMBB(SecondInsertedMBB)
33464 .addReg(Op2Reg)
33465 .addMBB(ThisMBB);
33466
33467 // The incoming value from FirstInsertedMBB is the same as the one from
33468 // ThisMBB (the true operand of the SELECT_CC/CMOV nodes).
33469 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
33470 // Copy the PHI result to the register defined by the second CMOV.
33471 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
33472 TII->get(TargetOpcode::COPY),
33473 SecondCascadedCMOV.getOperand(0).getReg())
33474 .addReg(FirstCMOV.getOperand(0).getReg());
33475
33476 // Now remove the CMOVs.
33477 FirstCMOV.eraseFromParent();
33478 SecondCascadedCMOV.eraseFromParent();
33479
33480 return SinkMBB;
33481}
33482
33483MachineBasicBlock *
33484X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
33485 MachineBasicBlock *ThisMBB) const {
33486 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33487 const DebugLoc &DL = MI.getDebugLoc();
33488
33489 // To "insert" a SELECT_CC instruction, we actually have to insert the
33490 // diamond control-flow pattern. The incoming instruction knows the
33491 // destination vreg to set, the condition code register to branch on, the
33492 // true/false values to select between and a branch opcode to use.
33493
33494 // ThisMBB:
33495 // ...
33496 // TrueVal = ...
33497 // cmpTY ccX, r1, r2
33498 // bCC copy1MBB
33499 // fallthrough --> FalseMBB
33500
33501 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
33502 // as described above, by inserting a BB, and then making a PHI at the join
33503 // point to select the true and false operands of the CMOV in the PHI.
33504 //
33505 // The code also handles two different cases of multiple CMOV opcodes
33506 // in a row.
33507 //
33508 // Case 1:
33509 // In this case, there are multiple CMOVs in a row, all of which are based on
33510 // the same condition setting (or the exact opposite condition setting).
33511 // In this case we can lower all the CMOVs using a single inserted BB, and
33512 // then make a number of PHIs at the join point to model the CMOVs. The only
33513 // trickiness here is that in a case like:
33514 //
33515 // t2 = CMOV cond1 t1, f1
33516 // t3 = CMOV cond1 t2, f2
33517 //
33518 // when rewriting this into PHIs, we have to perform some renaming on the
33519 // temps since you cannot have a PHI operand refer to a PHI result earlier
33520 // in the same block. The "simple" but wrong lowering would be:
33521 //
33522 // t2 = PHI t1(BB1), f1(BB2)
33523 // t3 = PHI t2(BB1), f2(BB2)
33524 //
33525 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
33526 // renaming is to note that on the path through BB1, t2 is really just a
33527 // copy of t1, and do that renaming, properly generating:
33528 //
33529 // t2 = PHI t1(BB1), f1(BB2)
33530 // t3 = PHI t1(BB1), f2(BB2)
33531 //
33532 // Case 2:
33533 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
33534 // function - EmitLoweredCascadedSelect.
33535
33536 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
33537 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
33538 MachineInstr *LastCMOV = &MI;
33539 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
33540
33541 // First, check for case 1, where there are multiple CMOVs with the same
33542 // condition. Of the two cases of multiple CMOV lowerings, case 1 reduces the
33543 // number of jumps the most.
33544
33545 if (isCMOVPseudo(MI)) {
33546 // See if we have a string of CMOVS with the same condition. Skip over
33547 // intervening debug insts.
33548 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
33549 (NextMIIt->getOperand(3).getImm() == CC ||
33550 NextMIIt->getOperand(3).getImm() == OppCC)) {
33551 LastCMOV = &*NextMIIt;
33552 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
33553 }
33554 }
33555
33556 // This checks for case 2, but only does so if we didn't already find
33557 // case 1, as indicated by LastCMOV still pointing at MI.
33558 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
33559 NextMIIt->getOpcode() == MI.getOpcode() &&
33560 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
33561 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
33562 NextMIIt->getOperand(1).isKill()) {
33563 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
33564 }
33565
33566 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
33567 MachineFunction *F = ThisMBB->getParent();
33568 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
33569 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
33570
33571 MachineFunction::iterator It = ++ThisMBB->getIterator();
33572 F->insert(It, FalseMBB);
33573 F->insert(It, SinkMBB);
33574
33575 // If the EFLAGS register isn't dead in the terminator, then claim that it's
33576 // live into the sink and copy blocks.
33577 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
33578 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
33579 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
33580 FalseMBB->addLiveIn(X86::EFLAGS);
33581 SinkMBB->addLiveIn(X86::EFLAGS);
33582 }
33583
33584 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
33585 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
33586 auto DbgIt = MachineBasicBlock::iterator(MI);
33587 while (DbgIt != DbgEnd) {
33588 auto Next = std::next(DbgIt);
33589 if (DbgIt->isDebugInstr())
33590 SinkMBB->push_back(DbgIt->removeFromParent());
33591 DbgIt = Next;
33592 }
33593
33594 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
33595 SinkMBB->splice(SinkMBB->end(), ThisMBB,
33596 std::next(MachineBasicBlock::iterator(LastCMOV)),
33597 ThisMBB->end());
33598 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
33599
33600 // Fallthrough block for ThisMBB.
33601 ThisMBB->addSuccessor(FalseMBB);
33602 // The true block target of the first (or only) branch is always SinkMBB.
33603 ThisMBB->addSuccessor(SinkMBB);
33604 // Fallthrough block for FalseMBB.
33605 FalseMBB->addSuccessor(SinkMBB);
33606
33607 // Create the conditional branch instruction.
33608 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
33609
33610 // SinkMBB:
33611 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
33612 // ...
33613 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
33614 MachineBasicBlock::iterator MIItEnd =
33615 std::next(MachineBasicBlock::iterator(LastCMOV));
33616 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
33617
33618 // Now remove the CMOV(s).
33619 ThisMBB->erase(MIItBegin, MIItEnd);
33620
33621 return SinkMBB;
33622}
33623
33624static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
33625 if (IsLP64) {
33626 if (isInt<8>(Imm))
33627 return X86::SUB64ri8;
33628 return X86::SUB64ri32;
33629 } else {
33630 if (isInt<8>(Imm))
33631 return X86::SUB32ri8;
33632 return X86::SUB32ri;
33633 }
33634}
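
The opcode choice above hinges on whether the immediate fits in a sign-extended 8-bit encoding, which is what llvm::isInt<8>(Imm) tests. A standalone illustration (not part of this file; fitsInSignedByte is a hypothetical stand-in for that predicate):

#include <cstdint>
#include <cstdio>

static bool fitsInSignedByte(int64_t Imm) {
  return Imm >= -128 && Imm <= 127; // same result as isInt<8>(Imm)
}

int main() {
  // For the LP64 case: a 64-byte adjustment can use SUB64ri8, a 4096-byte one needs SUB64ri32.
  std::printf("64   -> %s\n", fitsInSignedByte(64) ? "SUB64ri8" : "SUB64ri32");
  std::printf("4096 -> %s\n", fitsInSignedByte(4096) ? "SUB64ri8" : "SUB64ri32");
  return 0;
}
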
33635
33636MachineBasicBlock *
33637X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
33638 MachineBasicBlock *MBB) const {
33639 MachineFunction *MF = MBB->getParent();
33640 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33641 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
33642 const DebugLoc &DL = MI.getDebugLoc();
33643 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
33644
33645 const unsigned ProbeSize = getStackProbeSize(*MF);
33646
33647 MachineRegisterInfo &MRI = MF->getRegInfo();
33648 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33649 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33650 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33651
33652 MachineFunction::iterator MBBIter = ++MBB->getIterator();
33653 MF->insert(MBBIter, testMBB);
33654 MF->insert(MBBIter, blockMBB);
33655 MF->insert(MBBIter, tailMBB);
33656
33657 Register sizeVReg = MI.getOperand(1).getReg();
33658
33659 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
33660
33661 Register TmpStackPtr = MRI.createVirtualRegister(
33662 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33663 Register FinalStackPtr = MRI.createVirtualRegister(
33664 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
33665
33666 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
33667 .addReg(physSPReg);
33668 {
33669 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
33670 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
33671 .addReg(TmpStackPtr)
33672 .addReg(sizeVReg);
33673 }
33674
33675 // Test whether the stack pointer has already reached the final value.
33676
33677 BuildMI(testMBB, DL,
33678 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
33679 .addReg(FinalStackPtr)
33680 .addReg(physSPReg);
33681
33682 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
33683 .addMBB(tailMBB)
33684 .addImm(X86::COND_GE);
33685 testMBB->addSuccessor(blockMBB);
33686 testMBB->addSuccessor(tailMBB);
33687
33688 // Touch the block, then extend it. This is the opposite order from static
33689 // probing, where we allocate then touch; it avoids having to probe the tail
33690 // of the static alloca. Possible scenarios are:
33691 //
33692 // + ---- <- ------------ <- ------------- <- ------------ +
33693 // | |
33694 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
33695 // | |
33696 // + <- ----------- <- ------------ <- ----------- <- ------------ +
33697 //
33698 // The property we want to enforce is to never have more than [page alloc] between two probes.
33699
33700 const unsigned XORMIOpc =
33701 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
33702 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
33703 .addImm(0);
33704
33705 BuildMI(blockMBB, DL,
33706 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
33707 .addReg(physSPReg)
33708 .addImm(ProbeSize);
33709
33710
33711 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
33712 blockMBB->addSuccessor(testMBB);
33713
33714 // Replace the original instruction with the expected stack pointer.
33715 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
33716 .addReg(FinalStackPtr);
33717
33718 tailMBB->splice(tailMBB->end(), MBB,
33719 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
33720 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
33721 MBB->addSuccessor(testMBB);
33722
33723 // Delete the original pseudo instruction.
33724 MI.eraseFromParent();
33725
33726 // And we're done.
33727 return tailMBB;
33728}
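
The testMBB/blockMBB/tailMBB machinery built above behaves like the loop in this standalone sketch (an illustration only; the pointer values and probe size are assumptions, not taken from the source). The stack grows down, so the loop keeps touching the page at the current stack pointer and stepping down by ProbeSize until it has moved past the fully allocated address; the result handed back is the expected final stack pointer.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t ProbeSize = 4096;            // assumed page-sized probe step
  uint64_t SP = 0x7fff0000;                   // assumed incoming stack pointer
  const uint64_t AllocSize = 3 * 4096 + 512;  // assumed dynamic allocation size
  const uint64_t FinalSP = SP - AllocSize;    // what FinalStackPtr computes

  while (FinalSP < SP) {                      // testMBB: CMP FinalStackPtr, SP; JGE tailMBB
    std::printf("probe page at %#llx\n", (unsigned long long)SP); // blockMBB: XOR [SP], 0
    SP -= ProbeSize;                          // blockMBB: SUB SP, ProbeSize; JMP testMBB
  }
  uint64_t Result = FinalSP;                  // tailMBB: COPY FinalStackPtr into the result
  std::printf("alloca result %#llx\n", (unsigned long long)Result);
  return 0;
}
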
33729
33730MachineBasicBlock *
33731X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
33732 MachineBasicBlock *BB) const {
33733 MachineFunction *MF = BB->getParent();
33734 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
33735 const DebugLoc &DL = MI.getDebugLoc();
33736 const BasicBlock *LLVM_BB = BB->getBasicBlock();
33737
33738 assert(MF->shouldSplitStack());
33739
33740 const bool Is64Bit = Subtarget.is64Bit();
33741 const bool IsLP64 = Subtarget.isTarget64BitLP64();
33742
33743 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
33744 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
33745
33746 // BB:
33747 // ... [Till the alloca]
33748 // If stacklet is not large enough, jump to mallocMBB
33749 //
33750 // bumpMBB:
33751 // Allocate by subtracting from RSP
33752 // Jump to continueMBB
33753 //
33754 // mallocMBB:
33755 // Allocate by call to runtime
33756 //
33757 // continueMBB:
33758 // ...
33759 // [rest of original BB]
33760 //
33761
33762 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33763 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33764 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
33765
33766 MachineRegisterInfo &MRI = MF->getRegInfo();
33767 const TargetRegisterClass *AddrRegClass =
33768 getRegClassFor(getPointerTy(MF->getDataLayout()));
33769
33770 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33771 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
33772 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
33773 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
33774 sizeVReg = MI.getOperand(1).getReg(),
33775 physSPReg =
33776 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
33777
33778 MachineFunction::iterator MBBIter = ++BB->getIterator();
33779
33780 MF->insert(MBBIter, bumpMBB);
33781 MF->insert(MBBIter, mallocMBB);
33782 MF->insert(MBBIter, continueMBB);
33783
33784 continueMBB->splice(continueMBB->begin(), BB,
33785 std::next(MachineBasicBlock::iterator(MI)), BB->end());
33786 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
33787
33788 // Add code to the main basic block to check if the stack limit has been hit;
33789 // if so, jump to mallocMBB, otherwise fall through to bumpMBB.
33790 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
33791 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
33792 .addReg(tmpSPVReg).addReg(sizeVReg);
33793 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
33794 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
33795 .addReg(SPLimitVReg);
33796 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
33797
33798 // bumpMBB simply decreases the stack pointer, since we know the current
33799 // stacklet has enough space.
33800 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
33801 .addReg(SPLimitVReg);
33802 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
33803 .addReg(SPLimitVReg);
33804 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33805
33806 // Calls into a routine in libgcc to allocate more space from the heap.
33807 const uint32_t *RegMask =
33808 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
33809 if (IsLP64) {
33810 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
33811 .addReg(sizeVReg);
33812 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33813 .addExternalSymbol("__morestack_allocate_stack_space")
33814 .addRegMask(RegMask)
33815 .addReg(X86::RDI, RegState::Implicit)
33816 .addReg(X86::RAX, RegState::ImplicitDefine);
33817 } else if (Is64Bit) {
33818 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
33819 .addReg(sizeVReg);
33820 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
33821 .addExternalSymbol("__morestack_allocate_stack_space")
33822 .addRegMask(RegMask)
33823 .addReg(X86::EDI, RegState::Implicit)
33824 .addReg(X86::EAX, RegState::ImplicitDefine);
33825 } else {
33826 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
33827 .addImm(12);
33828 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
33829 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
33830 .addExternalSymbol("__morestack_allocate_stack_space")
33831 .addRegMask(RegMask)
33832 .addReg(X86::EAX, RegState::ImplicitDefine);
33833 }
33834
33835 if (!Is64Bit)
33836 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
33837 .addImm(16);
33838
33839 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
33840 .addReg(IsLP64 ? X86::RAX : X86::EAX);
33841 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
33842
33843 // Set up the CFG correctly.
33844 BB->addSuccessor(bumpMBB);
33845 BB->addSuccessor(mallocMBB);
33846 mallocMBB->addSuccessor(continueMBB);
33847 bumpMBB->addSuccessor(continueMBB);
33848
33849 // Take care of the PHI nodes.
33850 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
33851 MI.getOperand(0).getReg())
33852 .addReg(mallocPtrVReg)
33853 .addMBB(mallocMBB)
33854 .addReg(bumpSPPtrVReg)
33855 .addMBB(bumpMBB);
33856
33857 // Delete the original pseudo instruction.
33858 MI.eraseFromParent();
33859
33860 // And we're done.
33861 return continueMBB;
33862}
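
The segmented-stack decision above reduces to the comparison in this standalone sketch (an illustration only; the stack pointer, stacklet limit, and size are invented, and allocateFromHeap merely stands in for the __morestack_allocate_stack_space call): if the bumped stack pointer would cross the stacklet limit held in TLS, the allocation is redirected to the runtime, otherwise the stack pointer is simply moved down.

#include <cstdint>
#include <cstdio>

static uintptr_t allocateFromHeap(uintptr_t Size) {
  // Stand-in for the call emitted in mallocMBB.
  std::printf("call __morestack_allocate_stack_space(%zu)\n", (size_t)Size);
  return 0x100000;                                    // pretend heap block
}

int main() {
  uintptr_t SP = 0x7fff2000, StackLimit = 0x7fff0000; // assumed values (limit lives at fs:0x70 on LP64)
  uintptr_t Size = 0x4000;
  uintptr_t NewSP = SP - Size;                        // SUB tmpSP, size -> SPLimitVReg
  uintptr_t Result;
  if (StackLimit > NewSP) {                           // CMP fs:TlsOffset, SPLimitVReg; JG mallocMBB
    Result = allocateFromHeap(Size);                  // mallocMBB
  } else {
    SP = NewSP;                                       // bumpMBB: move the stack pointer down
    Result = NewSP;
  }
  std::printf("alloca result %#zx\n", (size_t)Result);
  return 0;
}
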
33863
33864MachineBasicBlock *
33865X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
33866 MachineBasicBlock *BB) const {
33867 MachineFunction *MF = BB->getParent();
33868 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33869 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
33870 const DebugLoc &DL = MI.getDebugLoc();
33871
33872 assert(!isAsynchronousEHPersonality(
33873            classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
33874        "SEH does not use catchret!");
33875
33876 // Only 32-bit EH needs to worry about manually restoring stack pointers.
33877 if (!Subtarget.is32Bit())
33878 return BB;
33879
33880 // C++ EH creates a new target block to hold the restore code, and wires up
33881 // the new block to the return destination with a normal JMP_4.
33882 MachineBasicBlock *RestoreMBB =
33883 MF->CreateMachineBasicBlock(BB->getBasicBlock());
33884 assert(BB->succ_size() == 1);
33885 MF->insert(std::next(BB->getIterator()), RestoreMBB);
33886 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
33887 BB->addSuccessor(RestoreMBB);
33888 MI.getOperand(0).setMBB(RestoreMBB);
33889
33890 // Marking this as an EH pad but not a funclet entry block causes PEI to
33891 // restore stack pointers in the block.
33892 RestoreMBB->setIsEHPad(true);
33893
33894 auto RestoreMBBI = RestoreMBB->begin();
33895 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
33896 return BB;
33897}
33898
33899MachineBasicBlock *
33900X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
33901 MachineBasicBlock *BB) const {
33902 // So, here we replace TLSADDR with the sequence:
33903 // adjust_stackdown -> TLSADDR -> adjust_stackup.
33904 // We need this because TLSADDR is lowered into a call inside MC; therefore,
33905 // without the two markers, shrink-wrapping may push the prologue/epilogue
33906 // past them.
33907 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
33908 const DebugLoc &DL = MI.getDebugLoc();
33909 MachineFunction &MF = *BB->getParent();
33910
33911 // Emit CALLSEQ_START right before the instruction.
33912 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
33913 MachineInstrBuilder CallseqStart =
33914 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
33915 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
33916
33917 // Emit CALLSEQ_END right after the instruction.
33918 // We don't call erase from parent because we want to keep the
33919 // original instruction around.
33920 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
33921 MachineInstrBuilder CallseqEnd =
33922 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
33923 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
33924
33925 return BB;
33926}
33927
33928MachineBasicBlock *
33929X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
33930 MachineBasicBlock *BB) const {
33931 // This is pretty easy. We're taking the value that we received from
33932 // our load from the relocation, sticking it in either RDI (x86-64)
33933 // or EAX and doing an indirect call. The return value will then
33934 // be in the normal return register.
33935 MachineFunction *F = BB->getParent();
33936 const X86InstrInfo *TII = Subtarget.getInstrInfo();
33937 const DebugLoc &DL = MI.getDebugLoc();
33938
33939 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
33940 assert(MI.getOperand(3).isGlobal() && "This should be a global");
33941
33942 // Get a register mask for the lowered call.
33943 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
33944 // proper register mask.
33945 const uint32_t *RegMask =
33946 Subtarget.is64Bit() ?
33947 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
33948 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
33949 if (Subtarget.is64Bit()) {
33950 MachineInstrBuilder MIB =
33951 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
33952 .addReg(X86::RIP)
33953 .addImm(0)
33954 .addReg(0)
33955 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33956 MI.getOperand(3).getTargetFlags())
33957 .addReg(0);
33958 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
33959 addDirectMem(MIB, X86::RDI);
33960 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
33961 } else if (!isPositionIndependent()) {
33962 MachineInstrBuilder MIB =
33963 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33964 .addReg(0)
33965 .addImm(0)
33966 .addReg(0)
33967 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33968 MI.getOperand(3).getTargetFlags())
33969 .addReg(0);
33970 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33971 addDirectMem(MIB, X86::EAX);
33972 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33973 } else {
33974 MachineInstrBuilder MIB =
33975 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
33976 .addReg(TII->getGlobalBaseReg(F))
33977 .addImm(0)
33978 .addReg(0)
33979 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
33980 MI.getOperand(3).getTargetFlags())
33981 .addReg(0);
33982 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
33983 addDirectMem(MIB, X86::EAX);
33984 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
33985 }
33986
33987 MI.eraseFromParent(); // The pseudo instruction is gone now.
33988 return BB;
33989}
33990
33991static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
33992 switch (RPOpc) {
33993 case X86::INDIRECT_THUNK_CALL32:
33994 return X86::CALLpcrel32;
33995 case X86::INDIRECT_THUNK_CALL64:
33996 return X86::CALL64pcrel32;
33997 case X86::INDIRECT_THUNK_TCRETURN32:
33998 return X86::TCRETURNdi;
33999 case X86::INDIRECT_THUNK_TCRETURN64:
34000 return X86::TCRETURNdi64;
34001 }
34002 llvm_unreachable("not indirect thunk opcode");
34003}
34004
34005static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
34006 unsigned Reg) {
34007 if (Subtarget.useRetpolineExternalThunk()) {
34008 // When using an external thunk for retpolines, we pick names that match the
34009 // names GCC happens to use as well. This helps simplify the implementation
34010 // of the thunks for kernels where they have no easy ability to create
34011 // aliases and are doing non-trivial configuration of the thunk's body. For
34012 // example, the Linux kernel will do boot-time hot patching of the thunk
34013 // bodies and cannot easily export aliases of these to loaded modules.
34014 //
34015 // Note that at any point in the future, we may need to change the semantics
34016 // of how we implement retpolines and at that time will likely change the
34017 // name of the called thunk. Essentially, there is no hard guarantee that
34018 // LLVM will generate calls to specific thunks; we merely make a best-effort
34019 // attempt to help out kernels and other systems where duplicating the
34020 // thunks is costly.
34021 switch (Reg) {
34022 case X86::EAX:
34023 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34024 return "__x86_indirect_thunk_eax";
34025 case X86::ECX:
34026 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34027 return "__x86_indirect_thunk_ecx";
34028 case X86::EDX:
34029 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34030 return "__x86_indirect_thunk_edx";
34031 case X86::EDI:
34032 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34033 return "__x86_indirect_thunk_edi";
34034 case X86::R11:
34035 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34036 return "__x86_indirect_thunk_r11";
34037 }
34038 llvm_unreachable("unexpected reg for external indirect thunk");
34039 }
34040
34041 if (Subtarget.useRetpolineIndirectCalls() ||
34042 Subtarget.useRetpolineIndirectBranches()) {
34043 // When targeting an internal COMDAT thunk use an LLVM-specific name.
34044 switch (Reg) {
34045 case X86::EAX:
34046 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34047 return "__llvm_retpoline_eax";
34048 case X86::ECX:
34049 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34050 return "__llvm_retpoline_ecx";
34051 case X86::EDX:
34052 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34053 return "__llvm_retpoline_edx";
34054 case X86::EDI:
34055 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
34056 return "__llvm_retpoline_edi";
34057 case X86::R11:
34058 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34059 return "__llvm_retpoline_r11";
34060 }
34061 llvm_unreachable("unexpected reg for retpoline");
34062 }
34063
34064 if (Subtarget.useLVIControlFlowIntegrity()) {
34065 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
34066 return "__llvm_lvi_thunk_r11";
34067 }
34068 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
34069}
34070
34071MachineBasicBlock *
34072X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
34073 MachineBasicBlock *BB) const {
34074 // Copy the virtual register into the R11 physical register and
34075 // call the retpoline thunk.
34076 const DebugLoc &DL = MI.getDebugLoc();
34077 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34078 Register CalleeVReg = MI.getOperand(0).getReg();
34079 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
34080
34081 // Find an available scratch register to hold the callee. On 64-bit, we can
34082 // just use R11, but we scan for uses anyway to ensure we don't generate
34083 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
34084 // already a register use operand to the call to hold the callee. If none
34085 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
34086 // register and ESI is the base pointer to realigned stack frames with VLAs.
34087 SmallVector<unsigned, 3> AvailableRegs;
34088 if (Subtarget.is64Bit())
34089 AvailableRegs.push_back(X86::R11);
34090 else
34091 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
34092
34093 // Zero out any registers that are already used.
34094 for (const auto &MO : MI.operands()) {
34095 if (MO.isReg() && MO.isUse())
34096 for (unsigned &Reg : AvailableRegs)
34097 if (Reg == MO.getReg())
34098 Reg = 0;
34099 }
34100
34101 // Choose the first remaining non-zero available register.
34102 unsigned AvailableReg = 0;
34103 for (unsigned MaybeReg : AvailableRegs) {
34104 if (MaybeReg) {
34105 AvailableReg = MaybeReg;
34106 break;
34107 }
34108 }
34109 if (!AvailableReg)
34110 report_fatal_error("calling convention incompatible with retpoline, no "
34111 "available registers");
34112
34113 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
34114
34115 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
34116 .addReg(CalleeVReg);
34117 MI.getOperand(0).ChangeToES(Symbol);
34118 MI.setDesc(TII->get(Opc));
34119 MachineInstrBuilder(*BB->getParent(), &MI)
34120 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
34121 return BB;
34122}
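
The scratch-register scan above amounts to the selection in this standalone sketch (an illustration only; the register names are plain strings and the set of registers already used by the call is invented): candidates that the call already uses are zeroed out, and the first surviving candidate holds the callee.

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // 32-bit candidate order used by EmitLoweredIndirectThunk.
  std::vector<std::string> AvailableRegs = {"EAX", "ECX", "EDX", "EDI"};
  std::vector<std::string> UsedByCall = {"EAX", "ECX"};    // assumed register uses on the call

  for (const std::string &Used : UsedByCall)               // zero out registers already in use
    for (std::string &Reg : AvailableRegs)
      if (Reg == Used)
        Reg.clear();

  std::string Chosen;                                      // pick the first remaining candidate
  for (const std::string &Reg : AvailableRegs)
    if (!Reg.empty()) { Chosen = Reg; break; }

  if (Chosen.empty())
    std::printf("no scratch register available\n");        // report_fatal_error in the real code
  else
    std::printf("callee goes in %s\n", Chosen.c_str());    // prints "EDX" for these assumptions
  return 0;
}
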
34123
34124/// A SetJmp implies that a future control-flow change will happen when the
34125/// corresponding LongJmp is called.
34126/// Instead of using a 'return' instruction, the long jump fixes the stack and
34127/// performs an indirect branch. To do so it uses the registers that were stored
34128/// in the jump buffer (when SetJmp was called).
34129/// If the shadow stack is enabled, we need to fix it as well, because some
34130/// return addresses will be skipped.
34131/// The function will save the SSP for future fixing in the function
34132/// emitLongJmpShadowStackFix.
34133/// \sa emitLongJmpShadowStackFix
34134/// \param [in] MI The temporary Machine Instruction for the builtin.
34135/// \param [in] MBB The Machine Basic Block that will be modified.
34136void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
34137 MachineBasicBlock *MBB) const {
34138 const DebugLoc &DL = MI.getDebugLoc();
34139 MachineFunction *MF = MBB->getParent();
34140 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34141 MachineRegisterInfo &MRI = MF->getRegInfo();
34142 MachineInstrBuilder MIB;
34143
34144 // Memory Reference.
34145 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34146 MI.memoperands_end());
34147
34148 // Initialize a register with zero.
34149 MVT PVT = getPointerTy(MF->getDataLayout());
34150 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34151 Register ZReg = MRI.createVirtualRegister(PtrRC);
34152 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
34153 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
34154 .addDef(ZReg)
34155 .addReg(ZReg, RegState::Undef)
34156 .addReg(ZReg, RegState::Undef);
34157
34158 // Read the current SSP Register value to the zeroed register.
34159 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
34160 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
34161 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
34162
34163 // Write the SSP register value to the third pointer-sized slot of the input memory buffer.
34164 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34165 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
34166 const int64_t SSPOffset = 3 * PVT.getStoreSize();
34167 const unsigned MemOpndSlot = 1;
34168 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34169 if (i == X86::AddrDisp)
34170 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
34171 else
34172 MIB.add(MI.getOperand(MemOpndSlot + i));
34173 }
34174 MIB.addReg(SSPCopyReg);
34175 MIB.setMemRefs(MMOs);
34176}
34177
34178MachineBasicBlock *
34179X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
34180 MachineBasicBlock *MBB) const {
34181 const DebugLoc &DL = MI.getDebugLoc();
34182 MachineFunction *MF = MBB->getParent();
34183 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34184 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34185 MachineRegisterInfo &MRI = MF->getRegInfo();
34186
34187 const BasicBlock *BB = MBB->getBasicBlock();
34188 MachineFunction::iterator I = ++MBB->getIterator();
34189
34190 // Memory Reference
34191 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34192 MI.memoperands_end());
34193
34194 unsigned DstReg;
34195 unsigned MemOpndSlot = 0;
34196
34197 unsigned CurOp = 0;
34198
34199 DstReg = MI.getOperand(CurOp++).getReg();
34200 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34201 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
34202 (void)TRI;
34203 Register mainDstReg = MRI.createVirtualRegister(RC);
34204 Register restoreDstReg = MRI.createVirtualRegister(RC);
34205
34206 MemOpndSlot = CurOp;
34207
34208 MVT PVT = getPointerTy(MF->getDataLayout());
34209 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
34210        "Invalid Pointer Size!");
34211
34212 // For v = setjmp(buf), we generate
34213 //
34214 // thisMBB:
34215 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
34216 // SjLjSetup restoreMBB
34217 //
34218 // mainMBB:
34219 // v_main = 0
34220 //
34221 // sinkMBB:
34222 // v = phi(main, restore)
34223 //
34224 // restoreMBB:
34225 // if base pointer being used, load it from frame
34226 // v_restore = 1
34227
34228 MachineBasicBlock *thisMBB = MBB;
34229 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34230 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34231 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
34232 MF->insert(I, mainMBB);
34233 MF->insert(I, sinkMBB);
34234 MF->push_back(restoreMBB);
34235 restoreMBB->setHasAddressTaken();
34236
34237 MachineInstrBuilder MIB;
34238
34239 // Transfer the remainder of BB and its successor edges to sinkMBB.
34240 sinkMBB->splice(sinkMBB->begin(), MBB,
34241 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34242 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34243
34244 // thisMBB:
34245 unsigned PtrStoreOpc = 0;
34246 unsigned LabelReg = 0;
34247 const int64_t LabelOffset = 1 * PVT.getStoreSize();
34248 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
34249 !isPositionIndependent();
34250
34251 // Prepare IP either in reg or imm.
34252 if (!UseImmLabel) {
34253 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34254 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34255 LabelReg = MRI.createVirtualRegister(PtrRC);
34256 if (Subtarget.is64Bit()) {
34257 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
34258 .addReg(X86::RIP)
34259 .addImm(0)
34260 .addReg(0)
34261 .addMBB(restoreMBB)
34262 .addReg(0);
34263 } else {
34264 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
34265 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
34266 .addReg(XII->getGlobalBaseReg(MF))
34267 .addImm(0)
34268 .addReg(0)
34269 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
34270 .addReg(0);
34271 }
34272 } else
34273 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
34274 // Store IP
34275 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
34276 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34277 if (i == X86::AddrDisp)
34278 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
34279 else
34280 MIB.add(MI.getOperand(MemOpndSlot + i));
34281 }
34282 if (!UseImmLabel)
34283 MIB.addReg(LabelReg);
34284 else
34285 MIB.addMBB(restoreMBB);
34286 MIB.setMemRefs(MMOs);
34287
34288 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
34289 emitSetJmpShadowStackFix(MI, thisMBB);
34290 }
34291
34292 // Setup
34293 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
34294 .addMBB(restoreMBB);
34295
34296 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
34297 MIB.addRegMask(RegInfo->getNoPreservedMask());
34298 thisMBB->addSuccessor(mainMBB);
34299 thisMBB->addSuccessor(restoreMBB);
34300
34301 // mainMBB:
34302 // EAX = 0
34303 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
34304 mainMBB->addSuccessor(sinkMBB);
34305
34306 // sinkMBB:
34307 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
34308 TII->get(X86::PHI), DstReg)
34309 .addReg(mainDstReg).addMBB(mainMBB)
34310 .addReg(restoreDstReg).addMBB(restoreMBB);
34311
34312 // restoreMBB:
34313 if (RegInfo->hasBasePointer(*MF)) {
34314 const bool Uses64BitFramePtr =
34315 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34316 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
34317 X86FI->setRestoreBasePointer(MF);
34318 Register FramePtr = RegInfo->getFrameRegister(*MF);
34319 Register BasePtr = RegInfo->getBaseRegister();
34320 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
34321 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
34322 FramePtr, true, X86FI->getRestoreBasePointerOffset())
34323 .setMIFlag(MachineInstr::FrameSetup);
34324 }
34325 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
34326 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34327 restoreMBB->addSuccessor(sinkMBB);
34328
34329 MI.eraseFromParent();
34330 return sinkMBB;
34331}
34332
34333/// Fix the shadow stack using the previously saved SSP pointer.
34334/// \sa emitSetJmpShadowStackFix
34335/// \param [in] MI The temporary Machine Instruction for the builtin.
34336/// \param [in] MBB The Machine Basic Block that will be modified.
34337/// \return The sink MBB that will perform the future indirect branch.
34338MachineBasicBlock *
34339X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
34340 MachineBasicBlock *MBB) const {
34341 const DebugLoc &DL = MI.getDebugLoc();
34342 MachineFunction *MF = MBB->getParent();
34343 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34344 MachineRegisterInfo &MRI = MF->getRegInfo();
34345
34346 // Memory Reference
34347 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34348 MI.memoperands_end());
34349
34350 MVT PVT = getPointerTy(MF->getDataLayout());
34351 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
34352
34353 // checkSspMBB:
34354 // xor vreg1, vreg1
34355 // rdssp vreg1
34356 // test vreg1, vreg1
34357 // je sinkMBB # Jump if Shadow Stack is not supported
34358 // fallMBB:
34359 // mov buf+24/12(%rip), vreg2
34360 // sub vreg1, vreg2
34361 // jbe sinkMBB # No need to fix the Shadow Stack
34362 // fixShadowMBB:
34363 // shr 3/2, vreg2
34364 // incssp vreg2 # fix the SSP according to the lower 8 bits
34365 // shr 8, vreg2
34366 // je sinkMBB
34367 // fixShadowLoopPrepareMBB:
34368 // shl vreg2
34369 // mov 128, vreg3
34370 // fixShadowLoopMBB:
34371 // incssp vreg3
34372 // dec vreg2
34373 // jne fixShadowLoopMBB # Iterate until you finish fixing
34374 // # the Shadow Stack
34375 // sinkMBB:
34376
34377 MachineFunction::iterator I = ++MBB->getIterator();
34378 const BasicBlock *BB = MBB->getBasicBlock();
34379
34380 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
34381 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34382 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
34383 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
34384 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
34385 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34386 MF->insert(I, checkSspMBB);
34387 MF->insert(I, fallMBB);
34388 MF->insert(I, fixShadowMBB);
34389 MF->insert(I, fixShadowLoopPrepareMBB);
34390 MF->insert(I, fixShadowLoopMBB);
34391 MF->insert(I, sinkMBB);
34392
34393 // Transfer the remainder of BB and its successor edges to sinkMBB.
34394 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
34395 MBB->end());
34396 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34397
34398 MBB->addSuccessor(checkSspMBB);
34399
34400 // Initialize a register with zero.
34401 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
34402 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
34403
34404 if (PVT == MVT::i64) {
34405 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
34406 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
34407 .addImm(0)
34408 .addReg(ZReg)
34409 .addImm(X86::sub_32bit);
34410 ZReg = TmpZReg;
34411 }
34412
34413 // Read the current SSP Register value to the zeroed register.
34414 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
34415 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
34416 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
34417
34418 // Check whether the result of the SSP register is zero and jump directly
34419 // to the sink.
34420 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
34421 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
34422 .addReg(SSPCopyReg)
34423 .addReg(SSPCopyReg);
34424 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
34425 checkSspMBB->addSuccessor(sinkMBB);
34426 checkSspMBB->addSuccessor(fallMBB);
34427
34428 // Reload the previously saved SSP register value.
34429 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
34430 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
34431 const int64_t SPPOffset = 3 * PVT.getStoreSize();
34432 MachineInstrBuilder MIB =
34433 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
34434 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34435 const MachineOperand &MO = MI.getOperand(i);
34436 if (i == X86::AddrDisp)
34437 MIB.addDisp(MO, SPPOffset);
34438 else if (MO.isReg()) // Don't add the whole operand, we don't want to
34439 // preserve kill flags.
34440 MIB.addReg(MO.getReg());
34441 else
34442 MIB.add(MO);
34443 }
34444 MIB.setMemRefs(MMOs);
34445
34446 // Subtract the current SSP from the previous SSP.
34447 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
34448 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
34449 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
34450 .addReg(PrevSSPReg)
34451 .addReg(SSPCopyReg);
34452
34453 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
34454 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
34455 fallMBB->addSuccessor(sinkMBB);
34456 fallMBB->addSuccessor(fixShadowMBB);
34457
34458 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
34459 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
34460 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
34461 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
34462 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
34463 .addReg(SspSubReg)
34464 .addImm(Offset);
34465
34466 // Increase the SSP, looking only at the lower 8 bits of the delta.
34467 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
34468 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
34469
34470 // Reset the lower 8 bits.
34471 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
34472 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
34473 .addReg(SspFirstShrReg)
34474 .addImm(8);
34475
34476 // Jump if the result of the shift is zero.
34477 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
34478 fixShadowMBB->addSuccessor(sinkMBB);
34479 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
34480
34481 // Do a single shift left.
34482 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
34483 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
34484 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
34485 .addReg(SspSecondShrReg);
34486
34487 // Save the value 128 to a register (will be used next with incssp).
34488 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
34489 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
34490 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
34491 .addImm(128);
34492 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
34493
34494 // Since incssp only looks at the lower 8 bits, we might need to do several
34495 // iterations of incssp until we finish fixing the shadow stack.
34496 Register DecReg = MRI.createVirtualRegister(PtrRC);
34497 Register CounterReg = MRI.createVirtualRegister(PtrRC);
34498 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
34499 .addReg(SspAfterShlReg)
34500 .addMBB(fixShadowLoopPrepareMBB)
34501 .addReg(DecReg)
34502 .addMBB(fixShadowLoopMBB);
34503
34504 // Every iteration we increase the SSP by 128.
34505 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
34506
34507 // Every iteration we decrement the counter by 1.
34508 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
34509 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
34510
34511 // Jump if the counter is not zero yet.
34512 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
34513 fixShadowLoopMBB->addSuccessor(sinkMBB);
34514 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
34515
34516 return sinkMBB;
34517}
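
The shift-and-loop arithmetic performed by fixShadowMBB and fixShadowLoopMBB can be checked with this standalone sketch for the 64-bit case (an illustration only; the shadow-stack pointer values are invented). INCSSPQ consumes only the low 8 bits of its operand and advances the SSP by that many 8-byte entries, so the delta is applied as one partial increment plus a counted loop of 128-entry increments.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t PrevSSP = 0x5000, CurSSP = 0x1000;   // made-up shadow stack pointers
  uint64_t DeltaBytes = PrevSSP - CurSSP;       // SUB PrevSSPReg, SSPCopyReg
  uint64_t Entries = DeltaBytes >> 3;           // SHR 3: 8 bytes per entry on x86-64

  uint64_t Fixed = Entries & 0xff;              // first INCSSPQ only sees the low 8 bits
  uint64_t Counter = (Entries >> 8) << 1;       // SHR 8 then SHL 1: number of 128-entry steps
  while (Counter) {                             // fixShadowLoopMBB
    Fixed += 128;                               // INCSSPQ with the constant 128
    --Counter;                                  // DEC + JNE
  }
  std::printf("advanced %llu of %llu entries\n",
              (unsigned long long)Fixed, (unsigned long long)Entries);
  return 0;
}
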
34518
34519MachineBasicBlock *
34520X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
34521 MachineBasicBlock *MBB) const {
34522 const DebugLoc &DL = MI.getDebugLoc();
34523 MachineFunction *MF = MBB->getParent();
34524 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34525 MachineRegisterInfo &MRI = MF->getRegInfo();
34526
34527 // Memory Reference
34528 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
34529 MI.memoperands_end());
34530
34531 MVT PVT = getPointerTy(MF->getDataLayout());
34532 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
34533 "Invalid Pointer Size!");
34534
34535 const TargetRegisterClass *RC =
34536 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
34537 Register Tmp = MRI.createVirtualRegister(RC);
34538 // Since FP is only updated here but NOT referenced, it's treated as GPR.
34539 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
34540 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
34541 Register SP = RegInfo->getStackRegister();
34542
34543 MachineInstrBuilder MIB;
34544
34545 const int64_t LabelOffset = 1 * PVT.getStoreSize();
34546 const int64_t SPOffset = 2 * PVT.getStoreSize();
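// The setjmp buffer is addressed as consecutive pointer-sized slots: slot 0
// holds the frame pointer, slot 1 the resume label (IP), and slot 2 the stack
// pointer, which is where the 1x and 2x store-size offsets above come from.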
34547
34548 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
34549 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
34550
34551 MachineBasicBlock *thisMBB = MBB;
34552
34553 // When CET and shadow stack are enabled, we need to fix the shadow stack.
34554 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
34555 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
34556 }
34557
34558 // Reload FP
34559 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
34560 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34561 const MachineOperand &MO = MI.getOperand(i);
34562 if (MO.isReg()) // Don't add the whole operand, we don't want to
34563 // preserve kill flags.
34564 MIB.addReg(MO.getReg());
34565 else
34566 MIB.add(MO);
34567 }
34568 MIB.setMemRefs(MMOs);
34569
34570 // Reload IP
34571 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
34572 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34573 const MachineOperand &MO = MI.getOperand(i);
34574 if (i == X86::AddrDisp)
34575 MIB.addDisp(MO, LabelOffset);
34576 else if (MO.isReg()) // Don't add the whole operand, we don't want to
34577 // preserve kill flags.
34578 MIB.addReg(MO.getReg());
34579 else
34580 MIB.add(MO);
34581 }
34582 MIB.setMemRefs(MMOs);
34583
34584 // Reload SP
34585 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
34586 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
34587 if (i == X86::AddrDisp)
34588 MIB.addDisp(MI.getOperand(i), SPOffset);
34589 else
34590 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
34591 // the last instruction of the expansion.
34592 }
34593 MIB.setMemRefs(MMOs);
34594
34595 // Jump
34596 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
34597
34598 MI.eraseFromParent();
34599 return thisMBB;
34600}
34601
34602void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
34603 MachineBasicBlock *MBB,
34604 MachineBasicBlock *DispatchBB,
34605 int FI) const {
34606 const DebugLoc &DL = MI.getDebugLoc();
34607 MachineFunction *MF = MBB->getParent();
34608 MachineRegisterInfo *MRI = &MF->getRegInfo();
34609 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34610
34611 MVT PVT = getPointerTy(MF->getDataLayout());
34612 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
34613
34614 unsigned Op = 0;
34615 unsigned VR = 0;
34616
34617 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
34618 !isPositionIndependent();
34619
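// With the small code model and no PIC, the dispatch block address fits in an
// immediate and can be stored directly; otherwise it is first materialized
// into a virtual register with LEA and stored from there.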
34620 if (UseImmLabel) {
34621 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
34622 } else {
34623 const TargetRegisterClass *TRC =
34624 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
34625 VR = MRI->createVirtualRegister(TRC);
34626 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
34627
34628 if (Subtarget.is64Bit())
34629 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
34630 .addReg(X86::RIP)
34631 .addImm(1)
34632 .addReg(0)
34633 .addMBB(DispatchBB)
34634 .addReg(0);
34635 else
34636 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
34637 .addReg(0) /* TII->getGlobalBaseReg(MF) */
34638 .addImm(1)
34639 .addReg(0)
34640 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
34641 .addReg(0);
34642 }
34643
34644 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
34645 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
34646 if (UseImmLabel)
34647 MIB.addMBB(DispatchBB);
34648 else
34649 MIB.addReg(VR);
34650}
34651
34652MachineBasicBlock *
34653X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
34654 MachineBasicBlock *BB) const {
34655 const DebugLoc &DL = MI.getDebugLoc();
34656 MachineFunction *MF = BB->getParent();
34657 MachineRegisterInfo *MRI = &MF->getRegInfo();
34658 const X86InstrInfo *TII = Subtarget.getInstrInfo();
34659 int FI = MF->getFrameInfo().getFunctionContextIndex();
34660
34661 // Get a mapping of the call site numbers to all of the landing pads they're
34662 // associated with.
34663 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
34664 unsigned MaxCSNum = 0;
34665 for (auto &MBB : *MF) {
34666 if (!MBB.isEHPad())
34667 continue;
34668
34669 MCSymbol *Sym = nullptr;
34670 for (const auto &MI : MBB) {
34671 if (MI.isDebugInstr())
34672 continue;
34673
34674 assert(MI.isEHLabel() && "expected EH_LABEL");
34675 Sym = MI.getOperand(0).getMCSymbol();
34676 break;
34677 }
34678
34679 if (!MF->hasCallSiteLandingPad(Sym))
34680 continue;
34681
34682 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
34683 CallSiteNumToLPad[CSI].push_back(&MBB);
34684 MaxCSNum = std::max(MaxCSNum, CSI);
34685 }
34686 }
34687
34688 // Get an ordered list of the machine basic blocks for the jump table.
34689 std::vector<MachineBasicBlock *> LPadList;
34690 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
34691 LPadList.reserve(CallSiteNumToLPad.size());
34692
34693 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
34694 for (auto &LP : CallSiteNumToLPad[CSI]) {
34695 LPadList.push_back(LP);
34696 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
34697 }
34698 }
34699
34700 assert(!LPadList.empty() &&
34701 "No landing pad destinations for the dispatch jump table!");
34702
34703 // Create the MBBs for the dispatch code.
34704
34705 // Shove the dispatch's address into the return slot in the function context.
34706 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
34707 DispatchBB->setIsEHPad(true);
34708
34709 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
34710 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
34711 DispatchBB->addSuccessor(TrapBB);
34712
34713 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
34714 DispatchBB->addSuccessor(DispContBB);
34715
34716 // Insert MBBs.
34717 MF->push_back(DispatchBB);
34718 MF->push_back(DispContBB);
34719 MF->push_back(TrapBB);
34720
34721 // Insert code into the entry block that creates and registers the function
34722 // context.
34723 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
34724
34725 // Create the jump table and associated information
34726 unsigned JTE = getJumpTableEncoding();
34727 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
34728 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
34729
34730 const X86RegisterInfo &RI = TII->getRegisterInfo();
34731 // Add a register mask with no preserved registers. This results in all
34732 // registers being marked as clobbered.
34733 if (RI.hasBasePointer(*MF)) {
34734 const bool FPIs64Bit =
34735 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
34736 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
34737 MFI->setRestoreBasePointer(MF);
34738
34739 Register FP = RI.getFrameRegister(*MF);
34740 Register BP = RI.getBaseRegister();
34741 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
34742 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
34743 MFI->getRestoreBasePointerOffset())
34744 .addRegMask(RI.getNoPreservedMask());
34745 } else {
34746 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
34747 .addRegMask(RI.getNoPreservedMask());
34748 }
34749
34750 // IReg is used as an index in a memory operand and therefore can't be SP
34751 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
34752 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
34753 Subtarget.is64Bit() ? 8 : 4);
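// The call-site index just loaded from the function context is range-checked
// against the number of jump table entries; any out-of-range value is routed
// to the trap block.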
34754 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
34755 .addReg(IReg)
34756 .addImm(LPadList.size());
34757 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
34758
34759 if (Subtarget.is64Bit()) {
34760 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34761 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
34762
34763 // leaq .LJTI0_0(%rip), BReg
34764 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
34765 .addReg(X86::RIP)
34766 .addImm(1)
34767 .addReg(0)
34768 .addJumpTableIndex(MJTI)
34769 .addReg(0);
34770 // movzx IReg64, IReg
34771 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
34772 .addImm(0)
34773 .addReg(IReg)
34774 .addImm(X86::sub_32bit);
34775
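// EK_BlockAddress tables hold absolute 8-byte addresses that can be jumped
// through directly, while EK_LabelDifference32 tables hold 32-bit offsets
// relative to the table base that must be sign-extended and added back to
// BReg before the indirect jump.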
34776 switch (JTE) {
34777 case MachineJumpTableInfo::EK_BlockAddress:
34778 // jmpq *(BReg,IReg64,8)
34779 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
34780 .addReg(BReg)
34781 .addImm(8)
34782 .addReg(IReg64)
34783 .addImm(0)
34784 .addReg(0);
34785 break;
34786 case MachineJumpTableInfo::EK_LabelDifference32: {
34787 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
34788 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
34789 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
34790
34791 // movl (BReg,IReg64,4), OReg
34792 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
34793 .addReg(BReg)
34794 .addImm(4)
34795 .addReg(IReg64)
34796 .addImm(0)
34797 .addReg(0);
34798 // movsx OReg64, OReg
34799 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
34800 // addq BReg, OReg64, TReg
34801 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
34802 .addReg(OReg64)
34803 .addReg(BReg);
34804 // jmpq *TReg
34805 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
34806 break;
34807 }
34808 default:
34809 llvm_unreachable("Unexpected jump table encoding");
34810 }
34811 } else {
34812 // jmpl *.LJTI0_0(,IReg,4)
34813 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
34814 .addReg(0)
34815 .addImm(4)
34816 .addReg(IReg)
34817 .addJumpTableIndex(MJTI)
34818 .addReg(0);
34819 }
34820
34821 // Add the jump table entries as successors to the MBB.
34822 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
34823 for (auto &LP : LPadList)
34824 if (SeenMBBs.insert(LP).second)
34825 DispContBB->addSuccessor(LP);
34826
34827 // N.B. the order the invoke BBs are processed in doesn't matter here.
34828 SmallVector<MachineBasicBlock *, 64> MBBLPads;
34829 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
34830 for (MachineBasicBlock *MBB : InvokeBBs) {
34831 // Remove the landing pad successor from the invoke block and replace it
34832 // with the new dispatch block.
34833 // Keep a copy of Successors since it's modified inside the loop.
34834 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
34835 MBB->succ_rend());
34836 // FIXME: Avoid quadratic complexity.
34837 for (auto MBBS : Successors) {
34838 if (MBBS->isEHPad()) {
34839 MBB->removeSuccessor(MBBS);
34840 MBBLPads.push_back(MBBS);
34841 }
34842 }
34843
34844 MBB->addSuccessor(DispatchBB);
34845
34846 // Find the invoke call and mark all of the callee-saved registers as
34847 // 'implicitly defined' so that they're spilled. This prevents instructions
34848 // from being moved to before the EH block, where they would never be
34849 // executed.
34850 for (auto &II : reverse(*MBB)) {
34851 if (!II.isCall())
34852 continue;
34853
34854 DenseMap<unsigned, bool> DefRegs;
34855 for (auto &MOp : II.operands())
34856 if (MOp.isReg())
34857 DefRegs[MOp.getReg()] = true;
34858
34859 MachineInstrBuilder MIB(*MF, &II);
34860 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
34861 unsigned Reg = SavedRegs[RegIdx];
34862 if (!DefRegs[Reg])
34863 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
34864 }
34865
34866 break;
34867 }
34868 }
34869
34870 // Mark all former landing pads as non-landing pads. The dispatch is the only
34871 // landing pad now.
34872 for (auto &LP : MBBLPads)
34873 LP->setIsEHPad(false);
34874
34875 // The instruction is gone now.
34876 MI.eraseFromParent();
34877 return BB;
34878}
34879
34880MachineBasicBlock *
34881X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
34882 MachineBasicBlock *BB) const {
34883 MachineFunction *MF = BB->getParent();
34884 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34885 const DebugLoc &DL = MI.getDebugLoc();
34886
34887 auto TMMImmToTMMReg = [](unsigned Imm) {
34888 assert (Imm < 8 && "Illegal tmm index");
34889 return X86::TMM0 + Imm;
34890 };
34891 switch (MI.getOpcode()) {
34892 default: llvm_unreachable("Unexpected instr type to insert");
34893 case X86::TLS_addr32:
34894 case X86::TLS_addr64:
34895 case X86::TLS_addrX32:
34896 case X86::TLS_base_addr32:
34897 case X86::TLS_base_addr64:
34898 case X86::TLS_base_addrX32:
34899 return EmitLoweredTLSAddr(MI, BB);
34900 case X86::INDIRECT_THUNK_CALL32:
34901 case X86::INDIRECT_THUNK_CALL64:
34902 case X86::INDIRECT_THUNK_TCRETURN32:
34903 case X86::INDIRECT_THUNK_TCRETURN64:
34904 return EmitLoweredIndirectThunk(MI, BB);
34905 case X86::CATCHRET:
34906 return EmitLoweredCatchRet(MI, BB);
34907 case X86::SEG_ALLOCA_32:
34908 case X86::SEG_ALLOCA_64:
34909 return EmitLoweredSegAlloca(MI, BB);
34910 case X86::PROBED_ALLOCA_32:
34911 case X86::PROBED_ALLOCA_64:
34912 return EmitLoweredProbedAlloca(MI, BB);
34913 case X86::TLSCall_32:
34914 case X86::TLSCall_64:
34915 return EmitLoweredTLSCall(MI, BB);
34916 case X86::CMOV_FR32:
34917 case X86::CMOV_FR32X:
34918 case X86::CMOV_FR64:
34919 case X86::CMOV_FR64X:
34920 case X86::CMOV_GR8:
34921 case X86::CMOV_GR16:
34922 case X86::CMOV_GR32:
34923 case X86::CMOV_RFP32:
34924 case X86::CMOV_RFP64:
34925 case X86::CMOV_RFP80:
34926 case X86::CMOV_VR64:
34927 case X86::CMOV_VR128:
34928 case X86::CMOV_VR128X:
34929 case X86::CMOV_VR256:
34930 case X86::CMOV_VR256X:
34931 case X86::CMOV_VR512:
34932 case X86::CMOV_VK1:
34933 case X86::CMOV_VK2:
34934 case X86::CMOV_VK4:
34935 case X86::CMOV_VK8:
34936 case X86::CMOV_VK16:
34937 case X86::CMOV_VK32:
34938 case X86::CMOV_VK64:
34939 return EmitLoweredSelect(MI, BB);
34940
34941 case X86::RDFLAGS32:
34942 case X86::RDFLAGS64: {
34943 unsigned PushF =
34944 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
34945 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
34946 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
34947 // Permit reads of the EFLAGS and DF registers without them being defined.
34948 // This intrinsic exists to read external processor state in flags, such as
34949 // the trap flag, interrupt flag, and direction flag, none of which are
34950 // modeled by the backend.
34951 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
34952 "Unexpected register in operand!");
34953 Push->getOperand(2).setIsUndef();
34954 assert(Push->getOperand(3).getReg() == X86::DF &&
34955 "Unexpected register in operand!");
34956 Push->getOperand(3).setIsUndef();
34957 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
34958
34959 MI.eraseFromParent(); // The pseudo is gone now.
34960 return BB;
34961 }
34962
34963 case X86::WRFLAGS32:
34964 case X86::WRFLAGS64: {
34965 unsigned Push =
34966 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
34967 unsigned PopF =
34968 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
34969 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
34970 BuildMI(*BB, MI, DL, TII->get(PopF));
34971
34972 MI.eraseFromParent(); // The pseudo is gone now.
34973 return BB;
34974 }
34975
34976 case X86::FP32_TO_INT16_IN_MEM:
34977 case X86::FP32_TO_INT32_IN_MEM:
34978 case X86::FP32_TO_INT64_IN_MEM:
34979 case X86::FP64_TO_INT16_IN_MEM:
34980 case X86::FP64_TO_INT32_IN_MEM:
34981 case X86::FP64_TO_INT64_IN_MEM:
34982 case X86::FP80_TO_INT16_IN_MEM:
34983 case X86::FP80_TO_INT32_IN_MEM:
34984 case X86::FP80_TO_INT64_IN_MEM: {
34985 // Change the floating point control register to use "round towards zero"
34986 // mode when truncating to an integer value.
34987 int OrigCWFrameIdx =
34988 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
34989 addFrameReference(BuildMI(*BB, MI, DL,
34990 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
34991
34992 // Load the old value of the control word...
34993 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34994 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
34995 OrigCWFrameIdx);
34996
34997 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
34998 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
34999 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
35000 .addReg(OldCW, RegState::Kill).addImm(0xC00);
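// (0xC00 == 0b1100'0000'0000, i.e. bits 11:10 of the FP control word.)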
35001
35002 // Extract to 16 bits.
35003 Register NewCW16 =
35004 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
35005 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
35006 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
35007
35008 // Prepare memory for FLDCW.
35009 int NewCWFrameIdx =
35010 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
35011 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
35012 NewCWFrameIdx)
35013 .addReg(NewCW16, RegState::Kill);
35014
35015 // Reload the modified control word now...
35016 addFrameReference(BuildMI(*BB, MI, DL,
35017 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
35018
35019 // Get the X86 opcode to use.
35020 unsigned Opc;
35021 switch (MI.getOpcode()) {
35022 default: llvm_unreachable("illegal opcode!");
35023 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
35024 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
35025 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
35026 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
35027 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
35028 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
35029 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
35030 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
35031 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
35032 }
35033
35034 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35035 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
35036 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
35037
35038 // Reload the original control word now.
35039 addFrameReference(BuildMI(*BB, MI, DL,
35040 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
35041
35042 MI.eraseFromParent(); // The pseudo instruction is gone now.
35043 return BB;
35044 }
35045
35046 // xbegin
35047 case X86::XBEGIN:
35048 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
35049
35050 case X86::VAARG_64:
35051 case X86::VAARG_X32:
35052 return EmitVAARGWithCustomInserter(MI, BB);
35053
35054 case X86::EH_SjLj_SetJmp32:
35055 case X86::EH_SjLj_SetJmp64:
35056 return emitEHSjLjSetJmp(MI, BB);
35057
35058 case X86::EH_SjLj_LongJmp32:
35059 case X86::EH_SjLj_LongJmp64:
35060 return emitEHSjLjLongJmp(MI, BB);
35061
35062 case X86::Int_eh_sjlj_setup_dispatch:
35063 return EmitSjLjDispatchBlock(MI, BB);
35064
35065 case TargetOpcode::STATEPOINT:
35066 // As an implementation detail, STATEPOINT shares the STACKMAP format at
35067 // this point in the process. We diverge later.
35068 return emitPatchPoint(MI, BB);
35069
35070 case TargetOpcode::STACKMAP:
35071 case TargetOpcode::PATCHPOINT:
35072 return emitPatchPoint(MI, BB);
35073
35074 case TargetOpcode::PATCHABLE_EVENT_CALL:
35075 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
35076 return BB;
35077
35078 case X86::LCMPXCHG8B: {
35079 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35080 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
35081 // requires a memory operand. If the current architecture is i686 and the
35082 // current function needs a base pointer - which is ESI for i686 - the register
35083 // allocator would not be able to allocate registers for an address of the
35084 // form X(%reg, %reg, Y): there would never be enough unreserved registers
35085 // during regalloc (without the base pointer the only option would be
35086 // X(%edi, %esi, Y)).
35087 // We give the register allocator a hand by precomputing the address in a new
35088 // vreg using LEA.
35089
35090 // If it is not i686 or there is no base pointer - nothing to do here.
35091 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
35092 return BB;
35093
35094 // Even though this code does not necessarily need the base pointer to be
35095 // ESI, we check for that. The reason: if this assert fails, something has
35096 // changed in the compiler's base pointer handling, and that change most
35097 // likely needs to be accounted for here as well.
35098 assert(TRI->getBaseRegister() == X86::ESI &&
35099 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
35100 "base pointer in mind");
35101
35102 MachineRegisterInfo &MRI = MF->getRegInfo();
35103 MVT SPTy = getPointerTy(MF->getDataLayout());
35104 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
35105 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
35106
35107 X86AddressMode AM = getAddressFromInstr(&MI, 0);
35108 // Regalloc does not need any help when the memory operand of CMPXCHG8B
35109 // does not use an index register.
35110 if (AM.IndexReg == X86::NoRegister)
35111 return BB;
35112
35113 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
35114 // four operand definitions that are E[ABCD] registers. We skip them and
35115 // then insert the LEA.
35116 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
35117 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
35118 RMBBI->definesRegister(X86::EBX) ||
35119 RMBBI->definesRegister(X86::ECX) ||
35120 RMBBI->definesRegister(X86::EDX))) {
35121 ++RMBBI;
35122 }
35123 MachineBasicBlock::iterator MBBI(RMBBI);
35124 addFullAddress(
35125 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
35126
35127 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
35128
35129 return BB;
35130 }
35131 case X86::LCMPXCHG16B_NO_RBX: {
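// CMPXCHG16B implicitly uses RBX for the low half of the value to write, so
// if RBX also serves as the base pointer the desired RBX value is routed
// through the LCMPXCHG16B_SAVE_RBX pseudo, letting the base pointer be
// restored afterwards; otherwise the operand is simply copied into RBX.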
35132 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35133 Register BasePtr = TRI->getBaseRegister();
35134 if (TRI->hasBasePointer(*MF) &&
35135 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
35136 if (!BB->isLiveIn(BasePtr))
35137 BB->addLiveIn(BasePtr);
35138 // Save RBX into a virtual register.
35139 Register SaveRBX =
35140 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35141 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
35142 .addReg(X86::RBX);
35143 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35144 MachineInstrBuilder MIB =
35145 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
35146 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
35147 MIB.add(MI.getOperand(Idx));
35148 MIB.add(MI.getOperand(X86::AddrNumOperands));
35149 MIB.addReg(SaveRBX);
35150 } else {
35151 // Simple case, just copy the virtual register to RBX.
35152 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
35153 .add(MI.getOperand(X86::AddrNumOperands));
35154 MachineInstrBuilder MIB =
35155 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
35156 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
35157 MIB.add(MI.getOperand(Idx));
35158 }
35159 MI.eraseFromParent();
35160 return BB;
35161 }
35162 case X86::MWAITX: {
35163 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
35164 Register BasePtr = TRI->getBaseRegister();
35165 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
35166 // If there is no need to save the base pointer, we generate MWAITXrrr;
35167 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
35168 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
35169 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
35170 .addReg(MI.getOperand(0).getReg());
35171 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
35172 .addReg(MI.getOperand(1).getReg());
35173 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
35174 .addReg(MI.getOperand(2).getReg());
35175 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
35176 MI.eraseFromParent();
35177 } else {
35178 if (!BB->isLiveIn(BasePtr)) {
35179 BB->addLiveIn(BasePtr);
35180 }
35181 // Parameters can be copied into ECX and EAX but not EBX yet.
35182 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
35183 .addReg(MI.getOperand(0).getReg());
35184 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
35185 .addReg(MI.getOperand(1).getReg());
35186 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
35187 // Save RBX into a virtual register.
35188 Register SaveRBX =
35189 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35190 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
35191 .addReg(X86::RBX);
35192 // Generate mwaitx pseudo.
35193 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
35194 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
35195 .addDef(Dst) // Destination tied in with SaveRBX.
35196 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
35197 .addUse(SaveRBX); // Save of base pointer.
35198 MI.eraseFromParent();
35199 }
35200 return BB;
35201 }
35202 case TargetOpcode::PREALLOCATED_SETUP: {
35203 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
35204 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
35205 MFI->setHasPreallocatedCall(true);
35206 int64_t PreallocatedId = MI.getOperand(0).getImm();
35207 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
35208 assert(StackAdjustment != 0 && "0 stack adjustment");
35209 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
35210 << StackAdjustment << "\n");
35211 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
35212 .addReg(X86::ESP)
35213 .addImm(StackAdjustment);
35214 MI.eraseFromParent();
35215 return BB;
35216 }
35217 case TargetOpcode::PREALLOCATED_ARG: {
35218 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
35219 int64_t PreallocatedId = MI.getOperand(1).getImm();
35220 int64_t ArgIdx = MI.getOperand(2).getImm();
35221 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
35222 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
35223 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
35224 << ", arg offset " << ArgOffset << "\n");
35225 // stack pointer + offset
35226 addRegOffset(
35227 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
35228 X86::ESP, false, ArgOffset);
35229 MI.eraseFromParent();
35230 return BB;
35231 }
35232 case X86::PTDPBSSD:
35233 case X86::PTDPBSUD:
35234 case X86::PTDPBUSD:
35235 case X86::PTDPBUUD:
35236 case X86::PTDPBF16PS: {
35237 unsigned Opc;
35238 switch (MI.getOpcode()) {
35239 default: llvm_unreachable("illegal opcode!");
35240 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
35241 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
35242 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
35243 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
35244 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
35245 }
35246
35247 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
35248 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
35249 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
35250 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
35251 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
35252
35253 MI.eraseFromParent(); // The pseudo is gone now.
35254 return BB;
35255 }
35256 case X86::PTILEZERO: {
35257 unsigned Imm = MI.getOperand(0).getImm();
35258 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
35259 MI.eraseFromParent(); // The pseudo is gone now.
35260 return BB;
35261 }
35262 case X86::PTILELOADD:
35263 case X86::PTILELOADDT1:
35264 case X86::PTILESTORED: {
35265 unsigned Opc;
35266 switch (MI.getOpcode()) {
35267 default: llvm_unreachable("illegal opcode!");
35268 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
35269 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
35270 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
35271 }
35272
35273 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
35274 unsigned CurOp = 0;
35275 if (Opc != X86::TILESTORED)
35276 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
35277 RegState::Define);
35278
35279 MIB.add(MI.getOperand(CurOp++)); // base
35280 MIB.add(MI.getOperand(CurOp++)); // scale
35281 MIB.add(MI.getOperand(CurOp++)); // index -- stride
35282 MIB.add(MI.getOperand(CurOp++)); // displacement
35283 MIB.add(MI.getOperand(CurOp++)); // segment
35284
35285 if (Opc == X86::TILESTORED)
35286 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
35287 RegState::Undef);
35288
35289 MI.eraseFromParent(); // The pseudo is gone now.
35290 return BB;
35291 }
35292 }
35293}
35294
35295//===----------------------------------------------------------------------===//
35296// X86 Optimization Hooks
35297//===----------------------------------------------------------------------===//
35298
35299bool
35300X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
35301 const APInt &DemandedBits,
35302 const APInt &DemandedElts,
35303 TargetLoweringOpt &TLO) const {
35304 EVT VT = Op.getValueType();
35305 unsigned Opcode = Op.getOpcode();
35306 unsigned EltSize = VT.getScalarSizeInBits();
35307
35308 if (VT.isVector()) {
35309 // If the constant is only all signbits in the active bits, then we should
35310 // extend it to the entire constant to allow it act as a boolean constant
35311 // vector.
35312 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
35313 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
35314 return false;
35315 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
35316 if (!DemandedElts[i] || V.getOperand(i).isUndef())
35317 continue;
35318 const APInt &Val = V.getConstantOperandAPInt(i);
35319 if (Val.getBitWidth() > Val.getNumSignBits() &&
35320 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
35321 return true;
35322 }
35323 return false;
35324 };
35325 // For vectors - if we have a constant, then try to sign extend.
35326 // TODO: Handle AND/ANDN cases.
35327 unsigned ActiveBits = DemandedBits.getActiveBits();
35328 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
35329 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
35330 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
35331 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
35332 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
35333 VT.getVectorNumElements());
35334 SDValue NewC =
35335 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
35336 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
35337 SDValue NewOp =
35338 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
35339 return TLO.CombineTo(Op, NewOp);
35340 }
35341 return false;
35342 }
35343
35344 // Only optimize Ands to prevent shrinking a constant that could be
35345 // matched by movzx.
35346 if (Opcode != ISD::AND)
35347 return false;
35348
35349 // Make sure the RHS really is a constant.
35350 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
35351 if (!C)
35352 return false;
35353
35354 const APInt &Mask = C->getAPIntValue();
35355
35356 // Clear all non-demanded bits initially.
35357 APInt ShrunkMask = Mask & DemandedBits;
35358
35359 // Find the width of the shrunk mask.
35360 unsigned Width = ShrunkMask.getActiveBits();
35361
35362 // If the mask is all 0s there's nothing to do here.
35363 if (Width == 0)
35364 return false;
35365
35366 // Find the next power of 2 width, rounding up to a byte.
35367 Width = PowerOf2Ceil(std::max(Width, 8U));
35368 // Truncate the width to size to handle illegal types.
35369 Width = std::min(Width, EltSize);
35370
35371 // Calculate a possible zero extend mask for this constant.
35372 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
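// (Illustrative: with EltSize = 32, Mask = 0x3FF and DemandedBits = 0xFF, the
// shrunk mask is 0xFF, Width rounds up to 8 and ZeroExtendMask = 0xFF, which
// the checks below accept, producing an AND that movzx can match.)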
35373
35374 // If we aren't changing the mask, just return true to keep it and prevent
35375 // the caller from optimizing.
35376 if (ZeroExtendMask == Mask)
35377 return true;
35378
35379 // Make sure the new mask can be represented by a combination of mask bits
35380 // and non-demanded bits.
35381 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
35382 return false;
35383
35384 // Replace the constant with the zero extend mask.
35385 SDLoc DL(Op);
35386 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
35387 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
35388 return TLO.CombineTo(Op, NewOp);
35389}
35390
35391void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
35392 KnownBits &Known,
35393 const APInt &DemandedElts,
35394 const SelectionDAG &DAG,
35395 unsigned Depth) const {
35396 unsigned BitWidth = Known.getBitWidth();
35397 unsigned NumElts = DemandedElts.getBitWidth();
35398 unsigned Opc = Op.getOpcode();
35399 EVT VT = Op.getValueType();
35400 assert((Opc >= ISD::BUILTIN_OP_END ||
35401 Opc == ISD::INTRINSIC_WO_CHAIN ||
35402 Opc == ISD::INTRINSIC_W_CHAIN ||
35403 Opc == ISD::INTRINSIC_VOID) &&
35404 "Should use MaskedValueIsZero if you don't know whether Op"
35405 " is a target node!");
35406
35407 Known.resetAll();
35408 switch (Opc) {
35409 default: break;
35410 case X86ISD::SETCC:
35411 Known.Zero.setBitsFrom(1);
35412 break;
35413 case X86ISD::MOVMSK: {
35414 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
35415 Known.Zero.setBitsFrom(NumLoBits);
35416 break;
35417 }
35418 case X86ISD::PEXTRB:
35419 case X86ISD::PEXTRW: {
35420 SDValue Src = Op.getOperand(0);
35421 EVT SrcVT = Src.getValueType();
35422 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
35423 Op.getConstantOperandVal(1));
35424 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
35425 Known = Known.anyextOrTrunc(BitWidth);
35426 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
35427 break;
35428 }
35429 case X86ISD::VSRAI:
35430 case X86ISD::VSHLI:
35431 case X86ISD::VSRLI: {
35432 unsigned ShAmt = Op.getConstantOperandVal(1);
35433 if (ShAmt >= VT.getScalarSizeInBits()) {
35434 Known.setAllZero();
35435 break;
35436 }
35437
35438 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35439 if (Opc == X86ISD::VSHLI) {
35440 Known.Zero <<= ShAmt;
35441 Known.One <<= ShAmt;
35442 // Low bits are known zero.
35443 Known.Zero.setLowBits(ShAmt);
35444 } else if (Opc == X86ISD::VSRLI) {
35445 Known.Zero.lshrInPlace(ShAmt);
35446 Known.One.lshrInPlace(ShAmt);
35447 // High bits are known zero.
35448 Known.Zero.setHighBits(ShAmt);
35449 } else {
35450 Known.Zero.ashrInPlace(ShAmt);
35451 Known.One.ashrInPlace(ShAmt);
35452 }
35453 break;
35454 }
35455 case X86ISD::PACKUS: {
35456 // PACKUS is just a truncation if the upper half is zero.
35457 APInt DemandedLHS, DemandedRHS;
35458 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
35459
35460 Known.One = APInt::getAllOnesValue(BitWidth * 2);
35461 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
35462
35463 KnownBits Known2;
35464 if (!!DemandedLHS) {
35465 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35466 Known = KnownBits::commonBits(Known, Known2);
35467 }
35468 if (!!DemandedRHS) {
35469 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35470 Known = KnownBits::commonBits(Known, Known2);
35471 }
35472
35473 if (Known.countMinLeadingZeros() < BitWidth)
35474 Known.resetAll();
35475 Known = Known.trunc(BitWidth);
35476 break;
35477 }
35478 case X86ISD::VBROADCAST: {
35479 SDValue Src = Op.getOperand(0);
35480 if (!Src.getSimpleValueType().isVector()) {
35481 Known = DAG.computeKnownBits(Src, Depth + 1);
35482 return;
35483 }
35484 break;
35485 }
35486 case X86ISD::ANDNP: {
35487 KnownBits Known2;
35488 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35489 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35490
35491 // ANDNP = (~X & Y);
35492 Known.One &= Known2.Zero;
35493 Known.Zero |= Known2.One;
35494 break;
35495 }
35496 case X86ISD::FOR: {
35497 KnownBits Known2;
35498 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35499 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35500
35501 Known |= Known2;
35502 break;
35503 }
35504 case X86ISD::PSADBW: {
35505 assert(VT.getScalarType() == MVT::i64 &&
35506 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
35507 "Unexpected PSADBW types");
35508
35509 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
35510 Known.Zero.setBitsFrom(16);
35511 break;
35512 }
35513 case X86ISD::PMULUDQ: {
35514 KnownBits Known2;
35515 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35516 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35517
35518 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
35519 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
35520 Known = KnownBits::mul(Known, Known2);
35521 break;
35522 }
35523 case X86ISD::CMOV: {
35524 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
35525 // If we don't know any bits, early out.
35526 if (Known.isUnknown())
35527 break;
35528 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
35529
35530 // Only known if known in both the LHS and RHS.
35531 Known = KnownBits::commonBits(Known, Known2);
35532 break;
35533 }
35534 case X86ISD::BEXTR:
35535 case X86ISD::BEXTRI: {
35536 SDValue Op0 = Op.getOperand(0);
35537 SDValue Op1 = Op.getOperand(1);
35538
35539 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
35540 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
35541 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
35542
35543 // If the length is 0, the result is 0.
35544 if (Length == 0) {
35545 Known.setAllZero();
35546 break;
35547 }
35548
35549 if ((Shift + Length) <= BitWidth) {
35550 Known = DAG.computeKnownBits(Op0, Depth + 1);
35551 Known = Known.extractBits(Length, Shift);
35552 Known = Known.zextOrTrunc(BitWidth);
35553 }
35554 }
35555 break;
35556 }
35557 case X86ISD::PDEP: {
35558 KnownBits Known2;
35559 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35560 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
35561 // Zeros are retained from the mask operand. But not ones.
35562 Known.One.clearAllBits();
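// (e.g. with a mask of 0xF0 the deposited result always has its low four bits
// clear.)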
35563 // The result will have at least as many trailing zeros as the non-mask
35564 // operand since bits can only map to the same or higher bit position.
35565 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
35566 break;
35567 }
35568 case X86ISD::PEXT: {
35569 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
35570 // The result has as many leading zeros as the number of zeroes in the mask.
35571 unsigned Count = Known.Zero.countPopulation();
35572 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
35573 Known.One.clearAllBits();
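// (e.g. a 32-bit mask with only five set bits extracts at most five bits, so
// the top 27 bits of the result are known zero.)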
35574 break;
35575 }
35576 case X86ISD::VTRUNC:
35577 case X86ISD::VTRUNCS:
35578 case X86ISD::VTRUNCUS:
35579 case X86ISD::CVTSI2P:
35580 case X86ISD::CVTUI2P:
35581 case X86ISD::CVTP2SI:
35582 case X86ISD::CVTP2UI:
35583 case X86ISD::MCVTP2SI:
35584 case X86ISD::MCVTP2UI:
35585 case X86ISD::CVTTP2SI:
35586 case X86ISD::CVTTP2UI:
35587 case X86ISD::MCVTTP2SI:
35588 case X86ISD::MCVTTP2UI:
35589 case X86ISD::MCVTSI2P:
35590 case X86ISD::MCVTUI2P:
35591 case X86ISD::VFPROUND:
35592 case X86ISD::VMFPROUND:
35593 case X86ISD::CVTPS2PH:
35594 case X86ISD::MCVTPS2PH: {
35595 // Truncations/Conversions - upper elements are known zero.
35596 EVT SrcVT = Op.getOperand(0).getValueType();
35597 if (SrcVT.isVector()) {
35598 unsigned NumSrcElts = SrcVT.getVectorNumElements();
35599 if (NumElts > NumSrcElts &&
35600 DemandedElts.countTrailingZeros() >= NumSrcElts)
35601 Known.setAllZero();
35602 }
35603 break;
35604 }
35605 case X86ISD::STRICT_CVTTP2SI:
35606 case X86ISD::STRICT_CVTTP2UI:
35607 case X86ISD::STRICT_CVTSI2P:
35608 case X86ISD::STRICT_CVTUI2P:
35609 case X86ISD::STRICT_VFPROUND:
35610 case X86ISD::STRICT_CVTPS2PH: {
35611 // Strict Conversions - upper elements are known zero.
35612 EVT SrcVT = Op.getOperand(1).getValueType();
35613 if (SrcVT.isVector()) {
35614 unsigned NumSrcElts = SrcVT.getVectorNumElements();
35615 if (NumElts > NumSrcElts &&
35616 DemandedElts.countTrailingZeros() >= NumSrcElts)
35617 Known.setAllZero();
35618 }
35619 break;
35620 }
35621 case X86ISD::MOVQ2DQ: {
35622 // Move from MMX to XMM. Upper half of XMM should be 0.
35623 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
35624 Known.setAllZero();
35625 break;
35626 }
35627 }
35628
35629 // Handle target shuffles.
35630 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
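// Each demanded output element is traced back through the shuffle mask to the
// input element it reads from; the result's known bits are those common to
// all of the referenced inputs, with zeroable elements contributing all-zero
// and undef elements forcing a conservative reset.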
35631 if (isTargetShuffle(Opc)) {
35632 SmallVector<int, 64> Mask;
35633 SmallVector<SDValue, 2> Ops;
35634 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35635 unsigned NumOps = Ops.size();
35636 unsigned NumElts = VT.getVectorNumElements();
35637 if (Mask.size() == NumElts) {
35638 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35639 Known.Zero.setAllBits(); Known.One.setAllBits();
35640 for (unsigned i = 0; i != NumElts; ++i) {
35641 if (!DemandedElts[i])
35642 continue;
35643 int M = Mask[i];
35644 if (M == SM_SentinelUndef) {
35645 // For UNDEF elements, we don't know anything about the common state
35646 // of the shuffle result.
35647 Known.resetAll();
35648 break;
35649 }
35650 if (M == SM_SentinelZero) {
35651 Known.One.clearAllBits();
35652 continue;
35653 }
35654 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35655 "Shuffle index out of range");
35656
35657 unsigned OpIdx = (unsigned)M / NumElts;
35658 unsigned EltIdx = (unsigned)M % NumElts;
35659 if (Ops[OpIdx].getValueType() != VT) {
35660 // TODO - handle target shuffle ops with different value types.
35661 Known.resetAll();
35662 break;
35663 }
35664 DemandedOps[OpIdx].setBit(EltIdx);
35665 }
35666 // Known bits are the values that are shared by every demanded element.
35667 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
35668 if (!DemandedOps[i])
35669 continue;
35670 KnownBits Known2 =
35671 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
35672 Known = KnownBits::commonBits(Known, Known2);
35673 }
35674 }
35675 }
35676 }
35677}
35678
35679unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
35680 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
35681 unsigned Depth) const {
35682 EVT VT = Op.getValueType();
35683 unsigned VTBits = VT.getScalarSizeInBits();
35684 unsigned Opcode = Op.getOpcode();
35685 switch (Opcode) {
35686 case X86ISD::SETCC_CARRY:
35687 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
35688 return VTBits;
35689
35690 case X86ISD::VTRUNC: {
35691 SDValue Src = Op.getOperand(0);
35692 MVT SrcVT = Src.getSimpleValueType();
35693 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
35694 assert(VTBits < NumSrcBits && "Illegal truncation input type");
35695 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
35696 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
35697 if (Tmp > (NumSrcBits - VTBits))
35698 return Tmp - (NumSrcBits - VTBits);
35699 return 1;
35700 }
35701
35702 case X86ISD::PACKSS: {
35703 // PACKSS is just a truncation if the sign bits extend to the packed size.
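// Worked example (sketch): for a v8i16 -> v16i8 PACKSS, SrcBits = 16 and VTBits = 8;
// if both inputs are known to have at least 12 sign bits, the packed result keeps
// 12 - (16 - 8) = 4 sign bits per element.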
35704 APInt DemandedLHS, DemandedRHS;
35705 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
35706 DemandedRHS);
35707
35708 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
35709 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
35710 if (!!DemandedLHS)
35711 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
35712 if (!!DemandedRHS)
35713 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
35714 unsigned Tmp = std::min(Tmp0, Tmp1);
35715 if (Tmp > (SrcBits - VTBits))
35716 return Tmp - (SrcBits - VTBits);
35717 return 1;
35718 }
35719
35720 case X86ISD::VBROADCAST: {
35721 SDValue Src = Op.getOperand(0);
35722 if (!Src.getSimpleValueType().isVector())
35723 return DAG.ComputeNumSignBits(Src, Depth + 1);
35724 break;
35725 }
35726
35727 case X86ISD::VSHLI: {
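// Sketch of the computation below: a left shift by C discards C sign bits, e.g. a
// v4i32 VSHLI by 5 on a source with 20 known sign bits leaves 20 - 5 = 15 sign bits,
// unless the shift removes them all (then only 1 sign bit is known).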
35728 SDValue Src = Op.getOperand(0);
35729 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
35730 if (ShiftVal.uge(VTBits))
35731 return VTBits; // Shifted all bits out --> zero.
35732 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35733 if (ShiftVal.uge(Tmp))
35734 return 1; // Shifted all sign bits out --> unknown.
35735 return Tmp - ShiftVal.getZExtValue();
35736 }
35737
35738 case X86ISD::VSRAI: {
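// Sketch of the computation below: an arithmetic right shift by C adds C copies of
// the sign bit, e.g. a v4i32 VSRAI by 7 on a source with 3 known sign bits yields
// min(32, 3 + 7) = 10 sign bits; shifting by VTBits - 1 or more splats the sign bit.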
35739 SDValue Src = Op.getOperand(0);
35740 APInt ShiftVal = Op.getConstantOperandAPInt(1);
35741 if (ShiftVal.uge(VTBits - 1))
35742 return VTBits; // Sign splat.
35743 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
35744 ShiftVal += Tmp;
35745 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
35746 }
35747
35748 case X86ISD::FSETCC:
35749 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
35750 if (VT == MVT::f32 || VT == MVT::f64 ||
35751 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
35752 return VTBits;
35753 break;
35754
35755 case X86ISD::PCMPGT:
35756 case X86ISD::PCMPEQ:
35757 case X86ISD::CMPP:
35758 case X86ISD::VPCOM:
35759 case X86ISD::VPCOMU:
35760 // Vector compares return zero/all-bits result values.
35761 return VTBits;
35762
35763 case X86ISD::ANDNP: {
35764 unsigned Tmp0 =
35765 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
35766 if (Tmp0 == 1) return 1; // Early out.
35767 unsigned Tmp1 =
35768 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
35769 return std::min(Tmp0, Tmp1);
35770 }
35771
35772 case X86ISD::CMOV: {
35773 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
35774 if (Tmp0 == 1) return 1; // Early out.
35775 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
35776 return std::min(Tmp0, Tmp1);
35777 }
35778 }
35779
35780 // Handle target shuffles.
35781 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
35782 if (isTargetShuffle(Opcode)) {
35783 SmallVector<int, 64> Mask;
35784 SmallVector<SDValue, 2> Ops;
35785 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
35786 unsigned NumOps = Ops.size();
35787 unsigned NumElts = VT.getVectorNumElements();
35788 if (Mask.size() == NumElts) {
35789 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
35790 for (unsigned i = 0; i != NumElts; ++i) {
35791 if (!DemandedElts[i])
35792 continue;
35793 int M = Mask[i];
35794 if (M == SM_SentinelUndef) {
35795 // For UNDEF elements, we don't know anything about the common state
35796 // of the shuffle result.
35797 return 1;
35798 } else if (M == SM_SentinelZero) {
35799 // Zero = all sign bits.
35800 continue;
35801 }
35802 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
35803 "Shuffle index out of range");
35804
35805 unsigned OpIdx = (unsigned)M / NumElts;
35806 unsigned EltIdx = (unsigned)M % NumElts;
35807 if (Ops[OpIdx].getValueType() != VT) {
35808 // TODO - handle target shuffle ops with different value types.
35809 return 1;
35810 }
35811 DemandedOps[OpIdx].setBit(EltIdx);
35812 }
35813 unsigned Tmp0 = VTBits;
35814 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
35815 if (!DemandedOps[i])
35816 continue;
35817 unsigned Tmp1 =
35818 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
35819 Tmp0 = std::min(Tmp0, Tmp1);
35820 }
35821 return Tmp0;
35822 }
35823 }
35824 }
35825
35826 // Fallback case.
35827 return 1;
35828}
35829
35830SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
35831 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
35832 return N->getOperand(0);
35833 return N;
35834}
35835
35836// Helper to look for a normal load that can be narrowed into a vzload with the
35837// specified VT and memory VT. Returns SDValue() on failure.
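// Usage sketch (hypothetical types): narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)
// would turn a simple load into an X86ISD::VZEXT_LOAD that reads only 64 bits and
// zero-fills the remaining vector lanes; volatile/atomic loads are rejected below.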
35838static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
35839 SelectionDAG &DAG) {
35840 // Can't if the load is volatile or atomic.
35841 if (!LN->isSimple())
35842 return SDValue();
35843
35844 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35845 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
35846 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
35847 LN->getPointerInfo(), LN->getOriginalAlign(),
35848 LN->getMemOperand()->getFlags());
35849}
35850
35851// Attempt to match a combined shuffle mask against supported unary shuffle
35852// instructions.
35853// TODO: Investigate sharing more of this with shuffle lowering.
35854static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
35855 bool AllowFloatDomain, bool AllowIntDomain,
35856 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
35857 const X86Subtarget &Subtarget, unsigned &Shuffle,
35858 MVT &SrcVT, MVT &DstVT) {
35859 unsigned NumMaskElts = Mask.size();
35860 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
35861
35862 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
35863 if (Mask[0] == 0 &&
35864 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
35865 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
35866 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35867 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
35868 Shuffle = X86ISD::VZEXT_MOVL;
35869 SrcVT = DstVT =
35870 !Subtarget.hasSSE2() && MaskEltSize == 32 ? MVT::v4f32 : MaskVT;
35871 return true;
35872 }
35873 }
35874
35875 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
35876 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
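// Worked example (sketch): a v16i8 mask {0, Z, Z, Z, 1, Z, Z, Z, 2, Z, Z, Z, 3, Z, Z, Z}
// (Z = zero/undef) matches Scale = 4 with MatchZero set, i.e. a
// ZERO_EXTEND_VECTOR_INREG from v16i8 to v4i32.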
35877 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
35878 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
35879 unsigned MaxScale = 64 / MaskEltSize;
35880 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
35881 bool MatchAny = true;
35882 bool MatchZero = true;
35883 unsigned NumDstElts = NumMaskElts / Scale;
35884 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
35885 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
35886 MatchAny = MatchZero = false;
35887 break;
35888 }
35889 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
35890 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
35891 }
35892 if (MatchAny || MatchZero) {
35893 assert(MatchZero && "Failed to match zext but matched aext?");
35894 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
35895 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
35896 MVT::getIntegerVT(MaskEltSize);
35897 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
35898
35899 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
35900 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
35901
35902 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
35903 if (SrcVT.getVectorNumElements() != NumDstElts)
35904 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
35905
35906 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
35907 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
35908 return true;
35909 }
35910 }
35911 }
35912
35913 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
35914 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
35915 isUndefOrEqual(Mask[0], 0) &&
35916 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
35917 Shuffle = X86ISD::VZEXT_MOVL;
35918 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
35919 return true;
35920 }
35921
35922 // Check if we have SSE3, which will let us use MOVDDUP etc. The
35923 // instructions are no slower than UNPCKLPD but have the option to
35924 // fold the input operand into even an unaligned memory load.
35925 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
35926 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, V1)) {
35927 Shuffle = X86ISD::MOVDDUP;
35928 SrcVT = DstVT = MVT::v2f64;
35929 return true;
35930 }
35931 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35932 Shuffle = X86ISD::MOVSLDUP;
35933 SrcVT = DstVT = MVT::v4f32;
35934 return true;
35935 }
35936 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, V1)) {
35937 Shuffle = X86ISD::MOVSHDUP;
35938 SrcVT = DstVT = MVT::v4f32;
35939 return true;
35940 }
35941 }
35942
35943 if (MaskVT.is256BitVector() && AllowFloatDomain) {
35944 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
35945 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, V1)) {
35946 Shuffle = X86ISD::MOVDDUP;
35947 SrcVT = DstVT = MVT::v4f64;
35948 return true;
35949 }
35950 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35951 Shuffle = X86ISD::MOVSLDUP;
35952 SrcVT = DstVT = MVT::v8f32;
35953 return true;
35954 }
35955 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
35956 Shuffle = X86ISD::MOVSHDUP;
35957 SrcVT = DstVT = MVT::v8f32;
35958 return true;
35959 }
35960 }
35961
35962 if (MaskVT.is512BitVector() && AllowFloatDomain) {
35963 assert(Subtarget.hasAVX512() &&
35964 "AVX512 required for 512-bit vector shuffles");
35965 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
35966 Shuffle = X86ISD::MOVDDUP;
35967 SrcVT = DstVT = MVT::v8f64;
35968 return true;
35969 }
35970 if (isTargetShuffleEquivalent(
35971 MaskVT, Mask,
35972 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
35973 Shuffle = X86ISD::MOVSLDUP;
35974 SrcVT = DstVT = MVT::v16f32;
35975 return true;
35976 }
35977 if (isTargetShuffleEquivalent(
35978 MaskVT, Mask,
35979 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
35980 Shuffle = X86ISD::MOVSHDUP;
35981 SrcVT = DstVT = MVT::v16f32;
35982 return true;
35983 }
35984 }
35985
35986 return false;
35987}
35988
35989// Attempt to match a combined shuffle mask against supported unary immediate
35990// permute instructions.
35991// TODO: Investigate sharing more of this with shuffle lowering.
35992static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
35993 const APInt &Zeroable,
35994 bool AllowFloatDomain, bool AllowIntDomain,
35995 const X86Subtarget &Subtarget,
35996 unsigned &Shuffle, MVT &ShuffleVT,
35997 unsigned &PermuteImm) {
35998 unsigned NumMaskElts = Mask.size();
35999 unsigned InputSizeInBits = MaskVT.getSizeInBits();
36000 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
36001 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
36002 bool ContainsZeros = isAnyZero(Mask);
36003
36004 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
36005 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
36006 // Check for lane crossing permutes.
36007 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
36008 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
36009 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
36010 Shuffle = X86ISD::VPERMI;
36011 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
36012 PermuteImm = getV4X86ShuffleImm(Mask);
36013 return true;
36014 }
36015 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
36016 SmallVector<int, 4> RepeatedMask;
36017 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
36018 Shuffle = X86ISD::VPERMI;
36019 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
36020 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
36021 return true;
36022 }
36023 }
36024 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
36025 // VPERMILPD can permute with a non-repeating shuffle.
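// Worked example (sketch): a v4f64 mask {1, 0, 3, 2} swaps the two elements inside
// each 128-bit lane; the loop below encodes it as
// PermuteImm = (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3) = 0x5.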
36026 Shuffle = X86ISD::VPERMILPI;
36027 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
36028 PermuteImm = 0;
36029 for (int i = 0, e = Mask.size(); i != e; ++i) {
36030 int M = Mask[i];
36031 if (M == SM_SentinelUndef)
36032 continue;
36033 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
36034 PermuteImm |= (M & 1) << i;
36035 }
36036 return true;
36037 }
36038 }
36039
36040 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
36041 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
36042 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
36043 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
36044 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
36045 SmallVector<int, 4> RepeatedMask;
36046 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36047 // Narrow the repeated mask to create 32-bit element permutes.
36048 SmallVector<int, 4> WordMask = RepeatedMask;
36049 if (MaskScalarSizeInBits == 64)
36050 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
36051
36052 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
36053 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
36054 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
36055 PermuteImm = getV4X86ShuffleImm(WordMask);
36056 return true;
36057 }
36058 }
36059
36060 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
36061 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
36062 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36063 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36064 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36065 SmallVector<int, 4> RepeatedMask;
36066 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
36067 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
36068 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
36069
36070 // PSHUFLW: permute lower 4 elements only.
36071 if (isUndefOrInRange(LoMask, 0, 4) &&
36072 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
36073 Shuffle = X86ISD::PSHUFLW;
36074 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36075 PermuteImm = getV4X86ShuffleImm(LoMask);
36076 return true;
36077 }
36078
36079 // PSHUFHW: permute upper 4 elements only.
36080 if (isUndefOrInRange(HiMask, 4, 8) &&
36081 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
36082 // Offset the HiMask so that we can create the shuffle immediate.
36083 int OffsetHiMask[4];
36084 for (int i = 0; i != 4; ++i)
36085 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
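// Worked example (sketch): HiMask {5, 4, 7, 6} becomes OffsetHiMask {1, 0, 3, 2},
// which getV4X86ShuffleImm packs two bits per element (lowest element first) into 0xB1.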
36086
36087 Shuffle = X86ISD::PSHUFHW;
36088 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
36089 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
36090 return true;
36091 }
36092 }
36093 }
36094
36095 // Attempt to match against byte/bit shifts.
36096 if (AllowIntDomain &&
36097 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36098 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36099 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36100 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
36101 Mask, 0, Zeroable, Subtarget);
36102 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
36103 32 <= ShuffleVT.getScalarSizeInBits())) {
36104 PermuteImm = (unsigned)ShiftAmt;
36105 return true;
36106 }
36107 }
36108
36109 // Attempt to match against bit rotates.
36110 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
36111 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
36112 Subtarget.hasAVX512())) {
36113 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
36114 Subtarget, Mask);
36115 if (0 < RotateAmt) {
36116 Shuffle = X86ISD::VROTLI;
36117 PermuteImm = (unsigned)RotateAmt;
36118 return true;
36119 }
36120 }
36121
36122 return false;
36123}
36124
36125// Attempt to match a combined unary shuffle mask against supported binary
36126// shuffle instructions.
36127// TODO: Investigate sharing more of this with shuffle lowering.
36128static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
36129 bool AllowFloatDomain, bool AllowIntDomain,
36130 SDValue &V1, SDValue &V2, const SDLoc &DL,
36131 SelectionDAG &DAG, const X86Subtarget &Subtarget,
36132 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
36133 bool IsUnary) {
36134 unsigned NumMaskElts = Mask.size();
36135 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
36136
36137 if (MaskVT.is128BitVector()) {
36138 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}) && AllowFloatDomain) {
36139 V2 = V1;
36140 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
36141 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
36142 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
36143 return true;
36144 }
36145 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}) && AllowFloatDomain) {
36146 V2 = V1;
36147 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
36148 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
36149 return true;
36150 }
36151 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}) &&
36152 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
36153 std::swap(V1, V2);
36154 Shuffle = X86ISD::MOVSD;
36155 SrcVT = DstVT = MVT::v2f64;
36156 return true;
36157 }
36158 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}) &&
36159 (AllowFloatDomain || !Subtarget.hasSSE41())) {
36160 Shuffle = X86ISD::MOVSS;
36161 SrcVT = DstVT = MVT::v4f32;
36162 return true;
36163 }
36164 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) &&
36165 Subtarget.hasFP16()) {
36166 Shuffle = X86ISD::MOVSH;
36167 SrcVT = DstVT = MVT::v8f16;
36168 return true;
36169 }
36170 }
36171
36172 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
36173 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
36174 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
36175 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
36176 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
36177 Subtarget)) {
36178 DstVT = MaskVT;
36179 return true;
36180 }
36181 }
36182
36183 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
36184 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
36185 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36186 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
36187 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36188 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
36189 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
36190 Subtarget)) {
36191 SrcVT = DstVT = MaskVT;
36192 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
36193 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
36194 return true;
36195 }
36196 }
36197
36198 // Attempt to match against an OR if we're performing a blend shuffle and the
36199 // non-blended source element is zero in each case.
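// Worked example (sketch): a v4i32 mask {0, 5, 2, 7} takes elements 0 and 2 from V1
// and elements 1 and 3 from V2; the OR match below succeeds only if V1 is known zero
// in elements 1 and 3 and V2 is known zero in elements 0 and 2.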
36200 if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
36201 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
36202 bool IsBlend = true;
36203 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
36204 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
36205 unsigned Scale1 = NumV1Elts / NumMaskElts;
36206 unsigned Scale2 = NumV2Elts / NumMaskElts;
36207 APInt DemandedZeroV1 = APInt::getNullValue(NumV1Elts);
36208 APInt DemandedZeroV2 = APInt::getNullValue(NumV2Elts);
36209 for (unsigned i = 0; i != NumMaskElts; ++i) {
36210 int M = Mask[i];
36211 if (M == SM_SentinelUndef)
36212 continue;
36213 if (M == SM_SentinelZero) {
36214 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
36215 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
36216 continue;
36217 }
36218 if (M == (int)i) {
36219 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
36220 continue;
36221 }
36222 if (M == (int)(i + NumMaskElts)) {
36223 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
36224 continue;
36225 }
36226 IsBlend = false;
36227 break;
36228 }
36229 if (IsBlend &&
36230 DAG.computeKnownBits(V1, DemandedZeroV1).isZero() &&
36231 DAG.computeKnownBits(V2, DemandedZeroV2).isZero()) {
36232 Shuffle = ISD::OR;
36233 SrcVT = DstVT = MaskVT.changeTypeToInteger();
36234 return true;
36235 }
36236 }
36237
36238 return false;
36239}
36240
36241static bool matchBinaryPermuteShuffle(
36242 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
36243 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
36244 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
36245 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
36246 unsigned NumMaskElts = Mask.size();
36247 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
36248
36249 // Attempt to match against VALIGND/VALIGNQ rotate.
36250 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
36251 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
36252 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
36253 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36254 if (!isAnyZero(Mask)) {
36255 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
36256 if (0 < Rotation) {
36257 Shuffle = X86ISD::VALIGN;
36258 if (EltSizeInBits == 64)
36259 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
36260 else
36261 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
36262 PermuteImm = Rotation;
36263 return true;
36264 }
36265 }
36266 }
36267
36268 // Attempt to match against PALIGNR byte rotate.
36269 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
36270 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
36271 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
36272 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
36273 if (0 < ByteRotation) {
36274 Shuffle = X86ISD::PALIGNR;
36275 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
36276 PermuteImm = ByteRotation;
36277 return true;
36278 }
36279 }
36280
36281 // Attempt to combine to X86ISD::BLENDI.
36282 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
36283 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
36284 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
36285 uint64_t BlendMask = 0;
36286 bool ForceV1Zero = false, ForceV2Zero = false;
36287 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
36288 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
36289 ForceV2Zero, BlendMask)) {
36290 if (MaskVT == MVT::v16i16) {
36291 // We can only use v16i16 PBLENDW if the lanes are repeated.
36292 SmallVector<int, 8> RepeatedMask;
36293 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
36294 RepeatedMask)) {
36295 assert(RepeatedMask.size() == 8 &&
36296 "Repeated mask size doesn't match!");
36297 PermuteImm = 0;
36298 for (int i = 0; i < 8; ++i)
36299 if (RepeatedMask[i] >= 8)
36300 PermuteImm |= 1 << i;
36301 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36302 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36303 Shuffle = X86ISD::BLENDI;
36304 ShuffleVT = MaskVT;
36305 return true;
36306 }
36307 } else {
36308 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36309 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36310 PermuteImm = (unsigned)BlendMask;
36311 Shuffle = X86ISD::BLENDI;
36312 ShuffleVT = MaskVT;
36313 return true;
36314 }
36315 }
36316 }
36317
36318 // Attempt to combine to INSERTPS, but only if it has elements that need to
36319 // be set to zero.
36320 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
36321 MaskVT.is128BitVector() && isAnyZero(Mask) &&
36322 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
36323 Shuffle = X86ISD::INSERTPS;
36324 ShuffleVT = MVT::v4f32;
36325 return true;
36326 }
36327
36328 // Attempt to combine to SHUFPD.
36329 if (AllowFloatDomain && EltSizeInBits == 64 &&
36330 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
36331 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
36332 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36333 bool ForceV1Zero = false, ForceV2Zero = false;
36334 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
36335 PermuteImm, Mask, Zeroable)) {
36336 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
36337 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
36338 Shuffle = X86ISD::SHUFP;
36339 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
36340 return true;
36341 }
36342 }
36343
36344 // Attempt to combine to SHUFPS.
36345 if (AllowFloatDomain && EltSizeInBits == 32 &&
36346 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
36347 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
36348 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
36349 SmallVector<int, 4> RepeatedMask;
36350 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
36351 // Match each half of the repeated mask, to determine if it's just
36352 // referencing one of the vectors, is zeroable, or is entirely undef.
36353 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
36354 int M0 = RepeatedMask[Offset];
36355 int M1 = RepeatedMask[Offset + 1];
36356
36357 if (isUndefInRange(RepeatedMask, Offset, 2)) {
36358 return DAG.getUNDEF(MaskVT);
36359 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
36360 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
36361 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
36362 return getZeroVector(MaskVT, Subtarget, DAG, DL);
36363 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
36364 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
36365 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
36366 return V1;
36367 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
36368 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
36369 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
36370 return V2;
36371 }
36372
36373 return SDValue();
36374 };
36375
36376 int ShufMask[4] = {-1, -1, -1, -1};
36377 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
36378 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
36379
36380 if (Lo && Hi) {
36381 V1 = Lo;
36382 V2 = Hi;
36383 Shuffle = X86ISD::SHUFP;
36384 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
36385 PermuteImm = getV4X86ShuffleImm(ShufMask);
36386 return true;
36387 }
36388 }
36389 }
36390
36391 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
36392 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
36393 MaskVT.is128BitVector() &&
36394 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
36395 Shuffle = X86ISD::INSERTPS;
36396 ShuffleVT = MVT::v4f32;
36397 return true;
36398 }
36399
36400 return false;
36401}
36402
36403static SDValue combineX86ShuffleChainWithExtract(
36404 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
36405 bool HasVariableMask, bool AllowVariableCrossLaneMask,
36406 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
36407 const X86Subtarget &Subtarget);
36408
36409/// Combine an arbitrary chain of shuffles into a single instruction if
36410/// possible.
36411///
36412/// This is the leaf of the recursive combine below. When we have found some
36413/// chain of single-use x86 shuffle instructions and accumulated the combined
36414/// shuffle mask represented by them, this will try to pattern match that mask
36415/// into either a single instruction if there is a special purpose instruction
36416/// for this operation, or into a PSHUFB instruction which is a fully general
36417/// instruction but should only be used to replace chains over a certain depth.
36418static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
36419 ArrayRef<int> BaseMask, int Depth,
36420 bool HasVariableMask,
36421 bool AllowVariableCrossLaneMask,
36422 bool AllowVariablePerLaneMask,
36423 SelectionDAG &DAG,
36424 const X86Subtarget &Subtarget) {
36425 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
36426 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
36427 "Unexpected number of shuffle inputs!");
36428
36429 MVT RootVT = Root.getSimpleValueType();
36430 unsigned RootSizeInBits = RootVT.getSizeInBits();
36431 unsigned NumRootElts = RootVT.getVectorNumElements();
36432
36433 // Canonicalize shuffle input op to the requested type.
36434 // TODO: Support cases where Op is smaller than VT.
36435 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
36436 return DAG.getBitcast(VT, Op);
36437 };
36438
36439 // Find the inputs that enter the chain. Note that multiple uses are OK
36440 // here; we're not going to remove the operands we find.
36441 bool UnaryShuffle = (Inputs.size() == 1);
36442 SDValue V1 = peekThroughBitcasts(Inputs[0]);
36443 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
36444 : peekThroughBitcasts(Inputs[1]));
36445
36446 MVT VT1 = V1.getSimpleValueType();
36447 MVT VT2 = V2.getSimpleValueType();
36448 assert(VT1.getSizeInBits() == RootSizeInBits &&
36449 VT2.getSizeInBits() == RootSizeInBits && "Vector size mismatch");
36450
36451 SDLoc DL(Root);
36452 SDValue Res;
36453
36454 unsigned NumBaseMaskElts = BaseMask.size();
36455 if (NumBaseMaskElts == 1) {
36456 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
36457 return CanonicalizeShuffleInput(RootVT, V1);
36458 }
36459
36460 bool OptForSize = DAG.shouldOptForSize();
36461 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
36462 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
36463 (RootVT.isFloatingPoint() && Depth >= 1) ||
36464 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
36465
36466 // Don't combine if we are an AVX512/EVEX target and the mask element size
36467 // is different from the root element size - this would prevent writemasks
36468 // from being reused.
36469 bool IsMaskedShuffle = false;
36470 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
36471 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
36472 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
36473 IsMaskedShuffle = true;
36474 }
36475 }
36476
36477 // If we are shuffling a broadcast (and not introducing zeros) then
36478 // we can just use the broadcast directly. This works for smaller broadcast
36479 // elements as well, as they already repeat across each mask element.
36480 if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
36481 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
36482 V1.getValueSizeInBits() >= RootSizeInBits) {
36483 return CanonicalizeShuffleInput(RootVT, V1);
36484 }
36485
36486 SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
36487
36488 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
36489 // etc. can be simplified.
36490 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
36491 SmallVector<int> ScaledMask, IdentityMask;
36492 unsigned NumElts = VT1.getVectorNumElements();
36493 if (Mask.size() <= NumElts &&
36494 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
36495 for (unsigned i = 0; i != NumElts; ++i)
36496 IdentityMask.push_back(i);
36497 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
36498 return CanonicalizeShuffleInput(RootVT, V1);
36499 }
36500 }
36501
36502 // Handle 128/256-bit lane shuffles of 512-bit vectors.
36503 if (RootVT.is512BitVector() &&
36504 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
36505 // If the upper subvectors are zeroable, then an extract+insert is more
36506 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
36507 // to zero the upper subvectors.
36508 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
36509 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
36510 return SDValue(); // Nothing to do!
36511 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
36512 "Unexpected lane shuffle");
36513 Res = CanonicalizeShuffleInput(RootVT, V1);
36514 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
36515 bool UseZero = isAnyZero(Mask);
36516 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
36517 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
36518 }
36519
36520 // Narrow shuffle mask to v4x128.
36521 SmallVector<int, 4> ScaledMask;
36522 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
36523 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
36524
36525 // Try to lower to vshuf64x2/vshuf32x4.
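// Worked example (sketch): a scaled mask {0, 1, 6, 7} keeps 128-bit lanes 0,1 from V1
// and takes lanes 2,3 from V2; MatchSHUF128 below encodes this as
// PermMask = 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0xE4.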
36526 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
36527 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
36528 SelectionDAG &DAG) {
36529 unsigned PermMask = 0;
36530 // Ensure elements came from the same Op.
36531 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
36532 for (int i = 0; i < 4; ++i) {
36533 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
36534 if (ScaledMask[i] < 0)
36535 continue;
36536
36537 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
36538 unsigned OpIndex = i / 2;
36539 if (Ops[OpIndex].isUndef())
36540 Ops[OpIndex] = Op;
36541 else if (Ops[OpIndex] != Op)
36542 return SDValue();
36543
36544 // Convert the 128-bit shuffle mask selection values into 128-bit
36545 // selection bits defined by a vshuf64x2 instruction's immediate control
36546 // byte.
36547 PermMask |= (ScaledMask[i] % 4) << (i * 2);
36548 }
36549
36550 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
36551 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
36552 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
36553 DAG.getTargetConstant(PermMask, DL, MVT::i8));
36554 };
36555
36556 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
36557 // doesn't work because our mask is for 128 bits and we don't have an MVT
36558 // to match that.
36559 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
36560 isUndefOrInRange(ScaledMask[1], 0, 2) &&
36561 isUndefOrInRange(ScaledMask[2], 2, 4) &&
36562 isUndefOrInRange(ScaledMask[3], 2, 4) &&
36563 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
36564 ScaledMask[0] == (ScaledMask[2] % 2)) &&
36565 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
36566 ScaledMask[1] == (ScaledMask[3] % 2));
36567
36568 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
36569 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
36570 return SDValue(); // Nothing to do!
36571 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
36572 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
36573 return DAG.getBitcast(RootVT, V);
36574 }
36575 }
36576
36577 // Handle 128-bit lane shuffles of 256-bit vectors.
36578 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
36579 // If the upper half is zeroable, then an extract+insert is more optimal
36580 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
36581 // zero the upper half.
36582 if (isUndefOrZero(Mask[1])) {
36583 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
36584 return SDValue(); // Nothing to do!
36585 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
36586 Res = CanonicalizeShuffleInput(RootVT, V1);
36587 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
36588 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
36589 256);
36590 }
36591
36592 // If we're splatting the low subvector, an insert-subvector 'concat'
36593 // pattern is quicker than VPERM2X128.
36594 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
36595 if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
36596 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
36597 return SDValue(); // Nothing to do!
36598 Res = CanonicalizeShuffleInput(RootVT, V1);
36599 Res = extractSubVector(Res, 0, DAG, DL, 128);
36600 return concatSubVectors(Res, Res, DAG, DL);
36601 }
36602
36603 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
36604 return SDValue(); // Nothing to do!
36605
36606 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
36607 // we need to use the zeroing feature.
36608 // Prefer blends for sequential shuffles unless we are optimizing for size.
36609 if (UnaryShuffle &&
36610 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
36611 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
36612 unsigned PermMask = 0;
36613 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
36614 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
36615 return DAG.getNode(
36616 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
36617 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
36618 }
36619
36620 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
36621 return SDValue(); // Nothing to do!
36622
36623 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
36624 if (!UnaryShuffle && !IsMaskedShuffle) {
36625 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
36626 "Unexpected shuffle sentinel value");
36627 // Prefer blends to X86ISD::VPERM2X128.
36628 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
36629 unsigned PermMask = 0;
36630 PermMask |= ((Mask[0] & 3) << 0);
36631 PermMask |= ((Mask[1] & 3) << 4);
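// E.g. (sketch): Mask {1, 2} selects V1's upper 128-bit lane into the low half and
// V2's lower lane into the high half, so PermMask = (1 << 0) | (2 << 4) = 0x21.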
36632 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
36633 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
36634 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
36635 CanonicalizeShuffleInput(RootVT, LHS),
36636 CanonicalizeShuffleInput(RootVT, RHS),
36637 DAG.getTargetConstant(PermMask, DL, MVT::i8));
36638 }
36639 }
36640 }
36641
36642 // For masks that have been widened to 128-bit elements or more,
36643 // narrow back down to 64-bit elements.
36644 if (BaseMaskEltSizeInBits > 64) {
36645 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
36646 int MaskScale = BaseMaskEltSizeInBits / 64;
36647 SmallVector<int, 64> ScaledMask;
36648 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
36649 Mask = std::move(ScaledMask);
36650 }
36651
36652 // For masked shuffles, we're trying to match the root width for better
36653 // writemask folding; attempt to scale the mask.
36654 // TODO - variable shuffles might need this to be widened again.
36655 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
36656 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
36657 int MaskScale = NumRootElts / Mask.size();
36658 SmallVector<int, 64> ScaledMask;
36659 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
36660 Mask = std::move(ScaledMask);
36661 }
36662
36663 unsigned NumMaskElts = Mask.size();
36664 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
36665
36666 // Determine the effective mask value type.
36667 FloatDomain &= (32 <= MaskEltSizeInBits);
36668 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
36669 : MVT::getIntegerVT(MaskEltSizeInBits);
36670 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
36671
36672 // Only allow legal mask types.
36673 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
36674 return SDValue();
36675
36676 // Attempt to match the mask against known shuffle patterns.
36677 MVT ShuffleSrcVT, ShuffleVT;
36678 unsigned Shuffle, PermuteImm;
36679
36680 // Which shuffle domains are permitted?
36681 // Permit domain crossing at higher combine depths.
36682 // TODO: Should we indicate which domain is preferred if both are allowed?
36683 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
36684 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
36685 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
36686
36687 // Determine zeroable mask elements.
36688 APInt KnownUndef, KnownZero;
36689 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
36690 APInt Zeroable = KnownUndef | KnownZero;
36691
36692 if (UnaryShuffle) {
36693 // Attempt to match against broadcast-from-vector.
36694 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
36695 if ((Subtarget.hasAVX2() ||
36696 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
36697 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
36698 if (isUndefOrEqual(Mask, 0)) {
36699 if (V1.getValueType() == MaskVT &&
36700 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36701 MayFoldLoad(V1.getOperand(0))) {
36702 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36703 return SDValue(); // Nothing to do!
36704 Res = V1.getOperand(0);
36705 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36706 return DAG.getBitcast(RootVT, Res);
36707 }
36708 if (Subtarget.hasAVX2()) {
36709 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
36710 return SDValue(); // Nothing to do!
36711 Res = CanonicalizeShuffleInput(MaskVT, V1);
36712 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
36713 return DAG.getBitcast(RootVT, Res);
36714 }
36715 }
36716 }
36717
36718 SDValue NewV1 = V1; // Save operand in case early exit happens.
36719 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36720 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36721 ShuffleVT) &&
36722 (!IsMaskedShuffle ||
36723 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36724 if (Depth == 0 && Root.getOpcode() == Shuffle)
36725 return SDValue(); // Nothing to do!
36726 Res = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36727 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
36728 return DAG.getBitcast(RootVT, Res);
36729 }
36730
36731 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36732 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
36733 PermuteImm) &&
36734 (!IsMaskedShuffle ||
36735 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36736 if (Depth == 0 && Root.getOpcode() == Shuffle)
36737 return SDValue(); // Nothing to do!
36738 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
36739 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
36740 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36741 return DAG.getBitcast(RootVT, Res);
36742 }
36743 }
36744
36745 // Attempt to combine to INSERTPS, but only if the inserted element has come
36746 // from a scalar.
36747 // TODO: Handle other insertions here as well?
36748 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
36749 Subtarget.hasSSE41() &&
36750 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3})) {
36751 if (MaskEltSizeInBits == 32) {
36752 SDValue SrcV1 = V1, SrcV2 = V2;
36753 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
36754 DAG) &&
36755 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
36756 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36757 return SDValue(); // Nothing to do!
36758 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36759 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
36760 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
36761 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36762 return DAG.getBitcast(RootVT, Res);
36763 }
36764 }
36765 if (MaskEltSizeInBits == 64 &&
36766 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}) &&
36767 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36768 V2.getScalarValueSizeInBits() <= 32) {
36769 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
36770 return SDValue(); // Nothing to do!
36771 PermuteImm = (/*DstIdx*/2 << 4) | (/*SrcIdx*/0 << 0);
36772 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
36773 CanonicalizeShuffleInput(MVT::v4f32, V1),
36774 CanonicalizeShuffleInput(MVT::v4f32, V2),
36775 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36776 return DAG.getBitcast(RootVT, Res);
36777 }
36778 }
36779
36780 SDValue NewV1 = V1; // Save operands in case early exit happens.
36781 SDValue NewV2 = V2;
36782 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
36783 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
36784 ShuffleVT, UnaryShuffle) &&
36785 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36786 if (Depth == 0 && Root.getOpcode() == Shuffle)
36787 return SDValue(); // Nothing to do!
36788 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
36789 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
36790 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
36791 return DAG.getBitcast(RootVT, Res);
36792 }
36793
36794 NewV1 = V1; // Save operands in case early exit happens.
36795 NewV2 = V2;
36796 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
36797 AllowIntDomain, NewV1, NewV2, DL, DAG,
36798 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
36799 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
36800 if (Depth == 0 && Root.getOpcode() == Shuffle)
36801 return SDValue(); // Nothing to do!
36802 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
36803 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
36804 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
36805 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
36806 return DAG.getBitcast(RootVT, Res);
36807 }
36808
36809 // Typically from here on, we need an integer version of MaskVT.
36810 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
36811 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
36812
36813 // Annoyingly, SSE4A instructions don't map into the above match helpers.
36814 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
36815 uint64_t BitLen, BitIdx;
36816 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
36817 Zeroable)) {
36818 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
36819 return SDValue(); // Nothing to do!
36820 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36821 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
36822 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36823 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36824 return DAG.getBitcast(RootVT, Res);
36825 }
36826
36827 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
36828 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
36829 return SDValue(); // Nothing to do!
36830 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
36831 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
36832 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
36833 DAG.getTargetConstant(BitLen, DL, MVT::i8),
36834 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
36835 return DAG.getBitcast(RootVT, Res);
36836 }
36837 }
36838
36839 // Match shuffle against TRUNCATE patterns.
36840 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
36841 // Match against a VTRUNC instruction, accounting for src/dst sizes.
36842 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
36843 Subtarget)) {
36844 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
36845 ShuffleSrcVT.getVectorNumElements();
36846 unsigned Opc =
36847 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
36848 if (Depth == 0 && Root.getOpcode() == Opc)
36849 return SDValue(); // Nothing to do!
36850 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36851 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
36852 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
36853 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
36854 return DAG.getBitcast(RootVT, Res);
36855 }
36856
36857 // Do we need a more general binary truncation pattern?
36858 if (RootSizeInBits < 512 &&
36859 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
36860 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
36861 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
36862 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
36863 if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
36864 return SDValue(); // Nothing to do!
36865 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36866 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
36867 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
36868 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
36869 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
36870 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
36871 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
36872 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
36873 return DAG.getBitcast(RootVT, Res);
36874 }
36875 }
36876
36877 // Don't try to re-form single instruction chains under any circumstances now
36878 // that we've done encoding canonicalization for them.
36879 if (Depth < 1)
36880 return SDValue();
36881
36882 // Depth threshold above which we can efficiently use variable mask shuffles.
36883 int VariableCrossLaneShuffleDepth =
36884 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
36885 int VariablePerLaneShuffleDepth =
36886 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
36887 AllowVariableCrossLaneMask &=
36888 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
36889 AllowVariablePerLaneMask &=
36890 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
36891 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
36892 // higher depth before combining them.
36893 bool AllowBWIVPERMV3 =
36894 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
36895
36896 bool MaskContainsZeros = isAnyZero(Mask);
36897
36898 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
36899 // If we have a single input lane-crossing shuffle then lower to VPERMV.
36900 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
36901 if (Subtarget.hasAVX2() &&
36902 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
36903 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
36904 Res = CanonicalizeShuffleInput(MaskVT, V1);
36905 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
36906 return DAG.getBitcast(RootVT, Res);
36907 }
36908 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
36909 if ((Subtarget.hasAVX512() &&
36910 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36911 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36912 (Subtarget.hasBWI() &&
36913 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36914 (Subtarget.hasVBMI() &&
36915 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
36916 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36917 V2 = DAG.getUNDEF(MaskVT);
36918 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36919 return DAG.getBitcast(RootVT, Res);
36920 }
36921 }
36922
36923 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
36924 // vector as the second source (non-VLX will pad to 512-bit shuffles).
36925 if (UnaryShuffle && AllowVariableCrossLaneMask &&
36926 ((Subtarget.hasAVX512() &&
36927 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36928 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36929 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
36930 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
36931 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36932 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36933 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36934 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36935 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
36936 for (unsigned i = 0; i != NumMaskElts; ++i)
36937 if (Mask[i] == SM_SentinelZero)
36938 Mask[i] = NumMaskElts + i;
36939 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36940 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
36941 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36942 return DAG.getBitcast(RootVT, Res);
36943 }
36944
36945 // If that failed and either input is extracted then try to combine as a
36946 // shuffle with the larger type.
36947 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
36948 Inputs, Root, BaseMask, Depth, HasVariableMask,
36949 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
36950 Subtarget))
36951 return WideShuffle;
36952
36953 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
36954 // (non-VLX will pad to 512-bit shuffles).
36955 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
36956 ((Subtarget.hasAVX512() &&
36957 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
36958 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
36959 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
36960 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
36961 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
36962 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
36963 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
36964 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
36965 V1 = CanonicalizeShuffleInput(MaskVT, V1);
36966 V2 = CanonicalizeShuffleInput(MaskVT, V2);
36967 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
36968 return DAG.getBitcast(RootVT, Res);
36969 }
36970 return SDValue();
36971 }
36972
36973 // See if we can combine a single input shuffle with zeros to a bit-mask,
36974 // which is much simpler than any shuffle.
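// Illustrative example (hypothetical values): a unary v4i32 shuffle with
// Mask = {0, SM_SentinelZero, 2, SM_SentinelUndef} keeps, zeroes or ignores
// each element in place, so it reduces to an AND with the constant vector
// {-1, 0, -1, undef} built below.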
36975 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
36976 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
36977 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
36978 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
36979 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
36980 APInt UndefElts(NumMaskElts, 0);
36981 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
36982 for (unsigned i = 0; i != NumMaskElts; ++i) {
36983 int M = Mask[i];
36984 if (M == SM_SentinelUndef) {
36985 UndefElts.setBit(i);
36986 continue;
36987 }
36988 if (M == SM_SentinelZero)
36989 continue;
36990 EltBits[i] = AllOnes;
36991 }
36992 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
36993 Res = CanonicalizeShuffleInput(MaskVT, V1);
36994 unsigned AndOpcode =
36995 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
36996 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
36997 return DAG.getBitcast(RootVT, Res);
36998 }
36999
37000 // If we have a single input shuffle with different shuffle patterns in the
37001 // 128-bit lanes, use the variable mask to VPERMILPS.
37002 // TODO Combine other mask types at higher depths.
37003 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37004 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
37005 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
37006 SmallVector<SDValue, 16> VPermIdx;
37007 for (int M : Mask) {
37008 SDValue Idx =
37009 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
37010 VPermIdx.push_back(Idx);
37011 }
37012 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
37013 Res = CanonicalizeShuffleInput(MaskVT, V1);
37014 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
37015 return DAG.getBitcast(RootVT, Res);
37016 }
37017
37018 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
37019 // to VPERMIL2PD/VPERMIL2PS.
37020 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
37021 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
37022 MaskVT == MVT::v8f32)) {
37023 // VPERMIL2 Operation.
37024 // Bits[3] - Match Bit.
37025 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
37026 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
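// Illustrative example (hypothetical values): for MaskVT == v4f32
// (NumEltsPerLane == 4) and Mask = {4, 1, SM_SentinelZero, 3}, the loop below
// produces VPerm2Idx = {4, 1, 8, 3} and M2ZImm = 2 - element 0 reads V2[0]
// (index 0 + 1 * 4) and element 2 is forced to zero via selector 8 plus the
// zero-match immediate. For 64-bit (PD) elements each index is shifted left
// by one.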
37027 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
37028 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
37029 SmallVector<int, 8> VPerm2Idx;
37030 unsigned M2ZImm = 0;
37031 for (int M : Mask) {
37032 if (M == SM_SentinelUndef) {
37033 VPerm2Idx.push_back(-1);
37034 continue;
37035 }
37036 if (M == SM_SentinelZero) {
37037 M2ZImm = 2;
37038 VPerm2Idx.push_back(8);
37039 continue;
37040 }
37041 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
37042 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
37043 VPerm2Idx.push_back(Index);
37044 }
37045 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37046 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37047 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
37048 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
37049 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
37050 return DAG.getBitcast(RootVT, Res);
37051 }
37052
37053 // If we have 3 or more shuffle instructions or a chain involving a variable
37054 // mask, we can replace them with a single PSHUFB instruction profitably.
37055 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
37056 // instructions, but in practice PSHUFB tends to be *very* fast so we're
37057 // more aggressive.
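// Illustrative example (hypothetical values): for a 128-bit root with
// NumMaskElts == 4, Ratio == 16 / 4 == 4, so mask element Mask[1] == 2
// expands to byte indices {8, 9, 10, 11} for output bytes 4..7, and an
// SM_SentinelZero element expands to four 0x80 bytes, which PSHUFB zeroes.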
37058 if (UnaryShuffle && AllowVariablePerLaneMask &&
37059 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37060 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
37061 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
37062 SmallVector<SDValue, 16> PSHUFBMask;
37063 int NumBytes = RootVT.getSizeInBits() / 8;
37064 int Ratio = NumBytes / NumMaskElts;
37065 for (int i = 0; i < NumBytes; ++i) {
37066 int M = Mask[i / Ratio];
37067 if (M == SM_SentinelUndef) {
37068 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
37069 continue;
37070 }
37071 if (M == SM_SentinelZero) {
37072 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
37073 continue;
37074 }
37075 M = Ratio * M + i % Ratio;
37076 assert((M / 16) == (i / 16) && "Lane crossing detected");
37077 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
37078 }
37079 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
37080 Res = CanonicalizeShuffleInput(ByteVT, V1);
37081 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
37082 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
37083 return DAG.getBitcast(RootVT, Res);
37084 }
37085
37086 // With XOP, if we have a 128-bit binary input shuffle we can always combine
37087 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
37088 // slower than PSHUFB on targets that support both.
37089 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
37090 Subtarget.hasXOP()) {
37091 // VPPERM Mask Operation
37092 // Bits[4:0] - Byte Index (0 - 31)
37093 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
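// Illustrative example (hypothetical values): with NumMaskElts == 8
// (Ratio == 2), mask element Mask[0] == 9 (element 1 of V2) expands to byte
// selectors {18, 19}, i.e. bytes 2-3 of the second source, while a zero
// element becomes 0x80 (op 4 - forced zero).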
37094 SmallVector<SDValue, 16> VPPERMMask;
37095 int NumBytes = 16;
37096 int Ratio = NumBytes / NumMaskElts;
37097 for (int i = 0; i < NumBytes; ++i) {
37098 int M = Mask[i / Ratio];
37099 if (M == SM_SentinelUndef) {
37100 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
37101 continue;
37102 }
37103 if (M == SM_SentinelZero) {
37104 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
37105 continue;
37106 }
37107 M = Ratio * M + i % Ratio;
37108 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
37109 }
37110 MVT ByteVT = MVT::v16i8;
37111 V1 = CanonicalizeShuffleInput(ByteVT, V1);
37112 V2 = CanonicalizeShuffleInput(ByteVT, V2);
37113 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
37114 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
37115 return DAG.getBitcast(RootVT, Res);
37116 }
37117
37118 // If that failed and either input is extracted then try to combine as a
37119 // shuffle with the larger type.
37120 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
37121 Inputs, Root, BaseMask, Depth, HasVariableMask,
37122 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
37123 return WideShuffle;
37124
37125 // If we have a dual input shuffle then lower to VPERMV3,
37126 // (non-VLX will pad to 512-bit shuffles)
37127 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
37128 ((Subtarget.hasAVX512() &&
37129 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
37130 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
37131 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
37132 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
37133 MaskVT == MVT::v16i32)) ||
37134 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
37135 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
37136 MaskVT == MVT::v32i16)) ||
37137 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
37138 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
37139 MaskVT == MVT::v64i8)))) {
37140 V1 = CanonicalizeShuffleInput(MaskVT, V1);
37141 V2 = CanonicalizeShuffleInput(MaskVT, V2);
37142 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
37143 return DAG.getBitcast(RootVT, Res);
37144 }
37145
37146 // Failed to find any combines.
37147 return SDValue();
37148}
37149
37150// Combine an arbitrary chain of shuffles + extract_subvectors into a single
37151// instruction if possible.
37152//
37153// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
37154// type size to attempt to combine:
37155// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
37156// -->
37157// extract_subvector(shuffle(x,y,m2),0)
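// For example (hypothetical types): a v4f32 shuffle of
// extract_subvector(v8f32 X, 4) and extract_subvector(v8f32 Y, 4) is widened
// to a v8f32 shuffle of X and Y (with the mask indices offset by the
// subvector offsets) followed by an extract_subvector of the low v4f32 of
// the result.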
37158static SDValue combineX86ShuffleChainWithExtract(
37159 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
37160 bool HasVariableMask, bool AllowVariableCrossLaneMask,
37161 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37162 const X86Subtarget &Subtarget) {
37163 unsigned NumMaskElts = BaseMask.size();
37164 unsigned NumInputs = Inputs.size();
37165 if (NumInputs == 0)
37166 return SDValue();
37167
37168 EVT RootVT = Root.getValueType();
37169 unsigned RootSizeInBits = RootVT.getSizeInBits();
37170 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
37171
37172 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
37173 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
37174
37175 // Peek through subvectors.
37176 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
37177 unsigned WideSizeInBits = RootSizeInBits;
37178 for (unsigned i = 0; i != NumInputs; ++i) {
37179 SDValue &Src = WideInputs[i];
37180 unsigned &Offset = Offsets[i];
37181 Src = peekThroughBitcasts(Src);
37182 EVT BaseVT = Src.getValueType();
37183 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
37184 Offset += Src.getConstantOperandVal(1);
37185 Src = Src.getOperand(0);
37186 }
37187 WideSizeInBits = std::max(WideSizeInBits,
37188 (unsigned)Src.getValueSizeInBits());
37189 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
37190 "Unexpected subvector extraction");
37191 Offset /= BaseVT.getVectorNumElements();
37192 Offset *= NumMaskElts;
37193 }
37194
37195 // Bail if we're always extracting from the lowest subvectors;
37196 // combineX86ShuffleChain should match this for the current width.
37197 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
37198 return SDValue();
37199
37200 unsigned Scale = WideSizeInBits / RootSizeInBits;
37201 assert((WideSizeInBits % RootSizeInBits) == 0 &&
37202 "Unexpected subvector extraction");
37203
37204 // If the src vector types aren't the same, see if we can extend
37205 // them to match each other.
37206 // TODO: Support different scalar types?
37207 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
37208 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
37209 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
37210 Op.getValueType().getScalarType() != WideSVT;
37211 }))
37212 return SDValue();
37213
37214 for (SDValue &NewInput : WideInputs) {
37215 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
37216 "Shuffle vector size mismatch");
37217 if (WideSizeInBits > NewInput.getValueSizeInBits())
37218 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
37219 SDLoc(NewInput), WideSizeInBits);
37220 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
37221 "Unexpected subvector extraction");
37222 }
37223
37224 // Create new mask for larger type.
37225 for (unsigned i = 1; i != NumInputs; ++i)
37226 Offsets[i] += i * Scale * NumMaskElts;
37227
37228 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
37229 for (int &M : WideMask) {
37230 if (M < 0)
37231 continue;
37232 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
37233 }
37234 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
37235
37236 // Remove unused/repeated shuffle source ops.
37237 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
37238 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
37239
37240 if (WideInputs.size() > 2)
37241 return SDValue();
37242
37243 // Increase depth for every upper subvector we've peeked through.
37244 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
37245
37246 // Attempt to combine wider chain.
37247 // TODO: Can we use a better Root?
37248 SDValue WideRoot = WideInputs[0];
37249 if (SDValue WideShuffle =
37250 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
37251 HasVariableMask, AllowVariableCrossLaneMask,
37252 AllowVariablePerLaneMask, DAG, Subtarget)) {
37253 WideShuffle =
37254 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
37255 return DAG.getBitcast(RootVT, WideShuffle);
37256 }
37257 return SDValue();
37258}
37259
37260// Canonicalize the combined shuffle mask chain with horizontal ops.
37261// NOTE: This may update the Ops and Mask.
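// For example (illustrative): a binary shuffle of HADD(X,Y) and HADD(Z,W)
// whose mask only selects whole 128-bit halves can be rebuilt as a single
// HADD with re-ordered operands, removing the shuffle entirely.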
37262static SDValue canonicalizeShuffleMaskWithHorizOp(
37263 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
37264 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
37265 const X86Subtarget &Subtarget) {
37266 if (Mask.empty() || Ops.empty())
37267 return SDValue();
37268
37269 SmallVector<SDValue> BC;
37270 for (SDValue Op : Ops)
37271 BC.push_back(peekThroughBitcasts(Op));
37272
37273 // All ops must be the same horizop + type.
37274 SDValue BC0 = BC[0];
37275 EVT VT0 = BC0.getValueType();
37276 unsigned Opcode0 = BC0.getOpcode();
37277 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
37278 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
37279 }))
37280 return SDValue();
37281
37282 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
37283 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
37284 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
37285 if (!isHoriz && !isPack)
37286 return SDValue();
37287
37288 // Do all ops have a single use?
37289 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
37290 return Op.hasOneUse() &&
37291 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
37292 });
37293
37294 int NumElts = VT0.getVectorNumElements();
37295 int NumLanes = VT0.getSizeInBits() / 128;
37296 int NumEltsPerLane = NumElts / NumLanes;
37297 int NumHalfEltsPerLane = NumEltsPerLane / 2;
37298 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
37299 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37300
37301 if (NumEltsPerLane >= 4 &&
37302 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
37303 SmallVector<int> LaneMask, ScaledMask;
37304 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
37305 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
37306 // See if we can remove the shuffle by resorting the HOP chain so that
37307 // the HOP args are pre-shuffled.
37308 // TODO: Generalize to any sized/depth chain.
37309 // TODO: Add support for PACKSS/PACKUS.
37310 if (isHoriz) {
37311 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
37312 auto GetHOpSrc = [&](int M) {
37313 if (M == SM_SentinelUndef)
37314 return DAG.getUNDEF(VT0);
37315 if (M == SM_SentinelZero)
37316 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
37317 SDValue Src0 = BC[M / 4];
37318 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
37319 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
37320 return Src1.getOperand(M % 2);
37321 return SDValue();
37322 };
37323 SDValue M0 = GetHOpSrc(ScaledMask[0]);
37324 SDValue M1 = GetHOpSrc(ScaledMask[1]);
37325 SDValue M2 = GetHOpSrc(ScaledMask[2]);
37326 SDValue M3 = GetHOpSrc(ScaledMask[3]);
37327 if (M0 && M1 && M2 && M3) {
37328 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
37329 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
37330 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
37331 }
37332 }
37333 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
37334 if (Ops.size() >= 2) {
37335 SDValue LHS, RHS;
37336 auto GetHOpSrc = [&](int M, int &OutM) {
37337 // TODO: Support SM_SentinelZero
37338 if (M < 0)
37339 return M == SM_SentinelUndef;
37340 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
37341 if (!LHS || LHS == Src) {
37342 LHS = Src;
37343 OutM = (M % 2);
37344 return true;
37345 }
37346 if (!RHS || RHS == Src) {
37347 RHS = Src;
37348 OutM = (M % 2) + 2;
37349 return true;
37350 }
37351 return false;
37352 };
37353 int PostMask[4] = {-1, -1, -1, -1};
37354 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
37355 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
37356 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
37357 GetHOpSrc(ScaledMask[3], PostMask[3])) {
37358 LHS = DAG.getBitcast(SrcVT, LHS);
37359 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
37360 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
37361 // Use SHUFPS for the permute so this will work on SSE3 targets,
37362 // shuffle combining and domain handling will simplify this later on.
37363 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
37364 Res = DAG.getBitcast(ShuffleVT, Res);
37365 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
37366 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
37367 }
37368 }
37369 }
37370 }
37371
37372 if (2 < Ops.size())
37373 return SDValue();
37374
37375 SDValue BC1 = BC[BC.size() - 1];
37376 if (Mask.size() == VT0.getVectorNumElements()) {
37377 // Canonicalize binary shuffles of horizontal ops that use the
37378 // same sources to a unary shuffle.
37379 // TODO: Try to perform this fold even if the shuffle remains.
37380 if (Ops.size() == 2) {
37381 auto ContainsOps = [](SDValue HOp, SDValue Op) {
37382 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
37383 };
37384 // Commute if all BC0's ops are contained in BC1.
37385 if (ContainsOps(BC1, BC0.getOperand(0)) &&
37386 ContainsOps(BC1, BC0.getOperand(1))) {
37387 ShuffleVectorSDNode::commuteMask(Mask);
37388 std::swap(Ops[0], Ops[1]);
37389 std::swap(BC0, BC1);
37390 }
37391
37392 // If BC1 can be represented by BC0, then convert to unary shuffle.
37393 if (ContainsOps(BC0, BC1.getOperand(0)) &&
37394 ContainsOps(BC0, BC1.getOperand(1))) {
37395 for (int &M : Mask) {
37396 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
37397 continue;
37398 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
37399 M -= NumElts + (SubLane * NumHalfEltsPerLane);
37400 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
37401 M += NumHalfEltsPerLane;
37402 }
37403 }
37404 }
37405
37406 // Canonicalize unary horizontal ops to only refer to lower halves.
37407 for (int i = 0; i != NumElts; ++i) {
37408 int &M = Mask[i];
37409 if (isUndefOrZero(M))
37410 continue;
37411 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
37412 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
37413 M -= NumHalfEltsPerLane;
37414 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
37415 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
37416 M -= NumHalfEltsPerLane;
37417 }
37418 }
37419
37420 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
37421 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
37422 // represents the LHS/RHS inputs for the lower/upper halves.
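// Illustrative example (hypothetical values): with Ops = {HADD(A,B), HADD(C,D)}
// and a widened 128-bit mask WideMask128 = {0, 2}, the code below selects
// Lo = A and Hi = C and emits HADD(A, C) in place of the shuffle.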
37423 SmallVector<int, 16> TargetMask128, WideMask128;
37424 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
37425 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
37426 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
37427 bool SingleOp = (Ops.size() == 1);
37428 if (isPack || OneUseOps ||
37429 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
37430 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
37431 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
37432 Lo = Lo.getOperand(WideMask128[0] & 1);
37433 Hi = Hi.getOperand(WideMask128[1] & 1);
37434 if (SingleOp) {
37435 SDValue Undef = DAG.getUNDEF(SrcVT);
37436 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
37437 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
37438 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
37439 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
37440 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
37441 }
37442 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
37443 }
37444 }
37445
37446 return SDValue();
37447}
37448
37449// Attempt to constant fold all of the constant source ops.
37450 // Returns the resulting constant if the entire shuffle is folded to a constant.
37451// TODO: Extend this to merge multiple constant Ops and update the mask.
37452static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
37453 ArrayRef<int> Mask, SDValue Root,
37454 bool HasVariableMask,
37455 SelectionDAG &DAG,
37456 const X86Subtarget &Subtarget) {
37457 MVT VT = Root.getSimpleValueType();
37458
37459 unsigned SizeInBits = VT.getSizeInBits();
37460 unsigned NumMaskElts = Mask.size();
37461 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
37462 unsigned NumOps = Ops.size();
37463
37464 // Extract constant bits from each source op.
37465 bool OneUseConstantOp = false;
37466 SmallVector<APInt, 16> UndefEltsOps(NumOps);
37467 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
37468 for (unsigned i = 0; i != NumOps; ++i) {
37469 SDValue SrcOp = Ops[i];
37470 OneUseConstantOp |= SrcOp.hasOneUse();
37471 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
37472 RawBitsOps[i]))
37473 return SDValue();
37474 }
37475
37476 // Only fold if at least one of the constants is only used once or
37477 // the combined shuffle has included a variable mask shuffle; this
37478 // is to avoid constant pool bloat.
37479 if (!OneUseConstantOp && !HasVariableMask)
37480 return SDValue();
37481
37482 // Shuffle the constant bits according to the mask.
37483 SDLoc DL(Root);
37484 APInt UndefElts(NumMaskElts, 0);
37485 APInt ZeroElts(NumMaskElts, 0);
37486 APInt ConstantElts(NumMaskElts, 0);
37487 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
37488 APInt::getNullValue(MaskSizeInBits));
37489 for (unsigned i = 0; i != NumMaskElts; ++i) {
37490 int M = Mask[i];
37491 if (M == SM_SentinelUndef) {
37492 UndefElts.setBit(i);
37493 continue;
37494 } else if (M == SM_SentinelZero) {
37495 ZeroElts.setBit(i);
37496 continue;
37497 }
37498 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
37499
37500 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
37501 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
37502
37503 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
37504 if (SrcUndefElts[SrcMaskIdx]) {
37505 UndefElts.setBit(i);
37506 continue;
37507 }
37508
37509 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
37510 APInt &Bits = SrcEltBits[SrcMaskIdx];
37511 if (!Bits) {
37512 ZeroElts.setBit(i);
37513 continue;
37514 }
37515
37516 ConstantElts.setBit(i);
37517 ConstantBitData[i] = Bits;
37518 }
37519 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
37520
37521 // Attempt to create a zero vector.
37522 if ((UndefElts | ZeroElts).isAllOnesValue())
37523 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
37524
37525 // Create the constant data.
37526 MVT MaskSVT;
37527 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
37528 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
37529 else
37530 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
37531
37532 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
37533 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
37534 return SDValue();
37535
37536 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
37537 return DAG.getBitcast(VT, CstOp);
37538}
37539
37540namespace llvm {
37541 namespace X86 {
37542 enum {
37543 MaxShuffleCombineDepth = 8
37544 };
37545 }
37546} // namespace llvm
37547
37548/// Fully generic combining of x86 shuffle instructions.
37549///
37550/// This should be the last combine run over the x86 shuffle instructions. Once
37551/// they have been fully optimized, this will recursively consider all chains
37552/// of single-use shuffle instructions, build a generic model of the cumulative
37553/// shuffle operation, and check for simpler instructions which implement this
37554/// operation. We use this primarily for two purposes:
37555///
37556/// 1) Collapse generic shuffles to specialized single instructions when
37557/// equivalent. In most cases, this is just an encoding size win, but
37558/// sometimes we will collapse multiple generic shuffles into a single
37559/// special-purpose shuffle.
37560/// 2) Look for sequences of shuffle instructions with 3 or more total
37561/// instructions, and replace them with the slightly more expensive SSSE3
37562/// PSHUFB instruction if available. We do this as the last combining step
37563/// to ensure we avoid using PSHUFB if we can implement the shuffle with
37564/// a suitable short sequence of other instructions. The PSHUFB will either
37565/// use a register or have to read from memory and so is slightly (but only
37566/// slightly) more expensive than the other shuffle instructions.
37567///
37568/// Because this is inherently a quadratic operation (for each shuffle in
37569/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
37570/// This should never be an issue in practice as the shuffle lowering doesn't
37571/// produce sequences of more than 8 instructions.
37572///
37573/// FIXME: We will currently miss some cases where the redundant shuffling
37574/// would simplify under the threshold for PSHUFB formation because of
37575/// combine-ordering. To fix this, we should do the redundant instruction
37576/// combining in this recursive walk.
37577static SDValue combineX86ShufflesRecursively(
37578 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
37579 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
37580 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
37581 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
37582 const X86Subtarget &Subtarget) {
37583 assert(RootMask.size() > 0 &&
37584 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
37585 "Illegal shuffle root mask");
37586 assert(Root.getSimpleValueType().isVector() &&
37587 "Shuffles operate on vector types!");
37588 unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
37589
37590 // Bound the depth of our recursive combine because this is ultimately
37591 // quadratic in nature.
37592 if (Depth >= MaxDepth)
37593 return SDValue();
37594
37595 // Directly rip through bitcasts to find the underlying operand.
37596 SDValue Op = SrcOps[SrcOpIndex];
37597 Op = peekThroughOneUseBitcasts(Op);
37598
37599 EVT VT = Op.getValueType();
37600 if (!VT.isVector() || !VT.isSimple())
37601 return SDValue(); // Bail if we hit a non-simple non-vector.
37602
37603 // FIXME: Just bail on f16 for now.
37604 if (VT.getVectorElementType() == MVT::f16)
37605 return SDValue();
37606
37607 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
37608 "Can only combine shuffles upto size of the root op.");
37609
37610 // Extract target shuffle mask and resolve sentinels and inputs.
37611 // TODO - determine Op's demanded elts from RootMask.
37612 SmallVector<int, 64> OpMask;
37613 SmallVector<SDValue, 2> OpInputs;
37614 APInt OpUndef, OpZero;
37615 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
37616 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
37617 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
37618 OpZero, DAG, Depth, false))
37619 return SDValue();
37620
37621 // Shuffle inputs must not be larger than the shuffle result.
37622 // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
37623 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
37624 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
37625 }))
37626 return SDValue();
37627
37628 // If the shuffle result was smaller than the root, we need to adjust the
37629 // mask indices and pad the mask with undefs.
37630 if (RootSizeInBits > VT.getSizeInBits()) {
37631 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
37632 unsigned OpMaskSize = OpMask.size();
37633 if (OpInputs.size() > 1) {
37634 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
37635 for (int &M : OpMask) {
37636 if (M < 0)
37637 continue;
37638 int EltIdx = M % OpMaskSize;
37639 int OpIdx = M / OpMaskSize;
37640 M = (PaddedMaskSize * OpIdx) + EltIdx;
37641 }
37642 }
37643 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
37644 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
37645 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
37646 }
37647
37648 SmallVector<int, 64> Mask;
37649 SmallVector<SDValue, 16> Ops;
37650
37651 // We don't need to merge masks if the root is empty.
37652 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
37653 if (EmptyRoot) {
37654 // Only resolve zeros if it will remove an input, otherwise we might end
37655 // up in an infinite loop.
37656 bool ResolveKnownZeros = true;
37657 if (!OpZero.isNullValue()) {
37658 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
37659 for (int i = 0, e = OpMask.size(); i != e; ++i) {
37660 int M = OpMask[i];
37661 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
37662 continue;
37663 UsedInputs.setBit(M / OpMask.size());
37664 if (UsedInputs.isAllOnesValue()) {
37665 ResolveKnownZeros = false;
37666 break;
37667 }
37668 }
37669 }
37670 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
37671 ResolveKnownZeros);
37672
37673 Mask = OpMask;
37674 Ops.append(OpInputs.begin(), OpInputs.end());
37675 } else {
37676 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
37677
37678 // Add the inputs to the Ops list, avoiding duplicates.
37679 Ops.append(SrcOps.begin(), SrcOps.end());
37680
37681 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
37682 // Attempt to find an existing match.
37683 SDValue InputBC = peekThroughBitcasts(Input);
37684 for (int i = 0, e = Ops.size(); i < e; ++i)
37685 if (InputBC == peekThroughBitcasts(Ops[i]))
37686 return i;
37687 // Match failed - should we replace an existing Op?
37688 if (InsertionPoint >= 0) {
37689 Ops[InsertionPoint] = Input;
37690 return InsertionPoint;
37691 }
37692 // Add to the end of the Ops list.
37693 Ops.push_back(Input);
37694 return Ops.size() - 1;
37695 };
37696
37697 SmallVector<int, 2> OpInputIdx;
37698 for (SDValue OpInput : OpInputs)
37699 OpInputIdx.push_back(
37700 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
37701
37702 assert(((RootMask.size() > OpMask.size() &&
37703 RootMask.size() % OpMask.size() == 0) ||
37704 (OpMask.size() > RootMask.size() &&
37705 OpMask.size() % RootMask.size() == 0) ||
37706 OpMask.size() == RootMask.size()) &&
37707 "The smaller number of elements must divide the larger.");
37708
37709 // This function can be performance-critical, so we rely on the power-of-2
37710 // knowledge that we have about the mask sizes to replace div/rem ops with
37711 // bit-masks and shifts.
37712 assert(isPowerOf2_32(RootMask.size()) &&
37713 "Non-power-of-2 shuffle mask sizes");
37714 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
37715 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
37716 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
37717
37718 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
37719 unsigned RootRatio =
37720 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
37721 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
37722 assert((RootRatio == 1 || OpRatio == 1) &&
37723 "Must not have a ratio for both incoming and op masks!");
37724
37725 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
37726 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
37727 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
37728 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
37729 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
37730
37731 Mask.resize(MaskWidth, SM_SentinelUndef);
37732
37733 // Merge this shuffle operation's mask into our accumulated mask. Note that
37734 // this shuffle's mask will be the first applied to the input, followed by
37735 // the root mask to get us all the way to the root value arrangement. The
37736 // reason for this order is that we are recursing up the operation chain.
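// Illustrative example (hypothetical sizes): if RootMask has 4 elements and
// OpMask has 8, then MaskWidth == 8, RootRatio == 2 and OpRatio == 1; root
// element RootMask[0] == 1 expands to scaled indices {2, 3} for wide slots 0
// and 1, which (when they refer to the current source op) are then mapped
// through OpMask[2] and OpMask[3].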
37737 for (unsigned i = 0; i < MaskWidth; ++i) {
37738 unsigned RootIdx = i >> RootRatioLog2;
37739 if (RootMask[RootIdx] < 0) {
37740 // This is a zero or undef lane, we're done.
37741 Mask[i] = RootMask[RootIdx];
37742 continue;
37743 }
37744
37745 unsigned RootMaskedIdx =
37746 RootRatio == 1
37747 ? RootMask[RootIdx]
37748 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
37749
37750 // Just insert the scaled root mask value if it references an input other
37751 // than the SrcOp we're currently inserting.
37752 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
37753 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
37754 Mask[i] = RootMaskedIdx;
37755 continue;
37756 }
37757
37758 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
37759 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
37760 if (OpMask[OpIdx] < 0) {
37761 // The incoming lanes are zero or undef, it doesn't matter which ones we
37762 // are using.
37763 Mask[i] = OpMask[OpIdx];
37764 continue;
37765 }
37766
37767 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
37768 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
37769 : (OpMask[OpIdx] << OpRatioLog2) +
37770 (RootMaskedIdx & (OpRatio - 1));
37771
37772 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
37773 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
37774 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
37775 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
37776
37777 Mask[i] = OpMaskedIdx;
37778 }
37779 }
37780
37781 // Remove unused/repeated shuffle source ops.
37782 resolveTargetShuffleInputsAndMask(Ops, Mask);
37783
37784 // Handle the all undef/zero/ones cases early.
37785 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
37786 return DAG.getUNDEF(Root.getValueType());
37787 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
37788 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
37789 SDLoc(Root));
37790 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
37791 none_of(Mask, [](int M) { return M == SM_SentinelZero; }))
37792 return getOnesVector(Root.getValueType(), DAG, SDLoc(Root));
37793
37794 assert(!Ops.empty() && "Shuffle with no inputs detected");
37795 HasVariableMask |= IsOpVariableMask;
37796
37797 // Update the list of shuffle nodes that have been combined so far.
37798 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
37799 SrcNodes.end());
37800 CombinedNodes.push_back(Op.getNode());
37801
37802 // See if we can recurse into each shuffle source op (if it's a target
37803 // shuffle). The source op should only be generally combined if it either has
37804 // a single use (i.e. the current Op) or all its users have already been combined;
37805 // if not, we can still combine but should prevent generation of variable
37806 // shuffles to avoid constant pool bloat.
37807 // Don't recurse if we already have more source ops than we can combine in
37808 // the remaining recursion depth.
37809 if (Ops.size() < (MaxDepth - Depth)) {
37810 for (int i = 0, e = Ops.size(); i < e; ++i) {
37811 // For empty roots, we need to resolve zeroable elements before combining
37812 // them with other shuffles.
37813 SmallVector<int, 64> ResolvedMask = Mask;
37814 if (EmptyRoot)
37815 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
37816 bool AllowCrossLaneVar = false;
37817 bool AllowPerLaneVar = false;
37818 if (Ops[i].getNode()->hasOneUse() ||
37819 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
37820 AllowCrossLaneVar = AllowVariableCrossLaneMask;
37821 AllowPerLaneVar = AllowVariablePerLaneMask;
37822 }
37823 if (SDValue Res = combineX86ShufflesRecursively(
37824 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
37825 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
37826 Subtarget))
37827 return Res;
37828 }
37829 }
37830
37831 // Attempt to constant fold all of the constant source ops.
37832 if (SDValue Cst = combineX86ShufflesConstants(
37833 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
37834 return Cst;
37835
37836 // If constant folding failed and we only have constants, then we have
37837 // multiple uses by a single non-variable shuffle, so just bail.
37838 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
37839 APInt UndefElts;
37840 SmallVector<APInt> RawBits;
37841 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
37842 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
37843 RawBits);
37844 })) {
37845 return SDValue();
37846 }
37847
37848 // Canonicalize the combined shuffle mask chain with horizontal ops.
37849 // NOTE: This will update the Ops and Mask.
37850 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
37851 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
37852 return DAG.getBitcast(Root.getValueType(), HOp);
37853
37854 // Widen any subvector shuffle inputs we've collected.
37855 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
37856 return Op.getValueSizeInBits() < RootSizeInBits;
37857 })) {
37858 for (SDValue &Op : Ops)
37859 if (Op.getValueSizeInBits() < RootSizeInBits)
37860 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
37861 RootSizeInBits);
37862 // Reresolve - we might have repeated subvector sources.
37863 resolveTargetShuffleInputsAndMask(Ops, Mask);
37864 }
37865
37866 // We can only combine unary and binary shuffle mask cases.
37867 if (Ops.size() <= 2) {
37868 // Minor canonicalization of the accumulated shuffle mask to make it easier
37869 // to match below. All this does is detect masks with sequential pairs of
37870 // elements, and shrink them to the half-width mask. It does this in a loop
37871 // so it will reduce the size of the mask to the minimal width mask which
37872 // performs an equivalent shuffle.
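// For illustration: a v4i32-style mask {0,1,6,7} pairs up as (0,1) and (6,7),
// so it widens to the v2i64-style mask {0,3}; a mask such as {1,2,5,6} has no
// such pairing and is left unchanged.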
37873 while (Mask.size() > 1) {
37874 SmallVector<int, 64> WidenedMask;
37875 if (!canWidenShuffleElements(Mask, WidenedMask))
37876 break;
37877 Mask = std::move(WidenedMask);
37878 }
37879
37880 // Canonicalization of binary shuffle masks to improve pattern matching by
37881 // commuting the inputs.
37882 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
37883 ShuffleVectorSDNode::commuteMask(Mask);
37884 std::swap(Ops[0], Ops[1]);
37885 }
37886
37887 // Finally, try to combine into a single shuffle instruction.
37888 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
37889 AllowVariableCrossLaneMask,
37890 AllowVariablePerLaneMask, DAG, Subtarget);
37891 }
37892
37893 // If that failed and any input is extracted then try to combine as a
37894 // shuffle with the larger type.
37895 return combineX86ShuffleChainWithExtract(
37896 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
37897 AllowVariablePerLaneMask, DAG, Subtarget);
37898}
37899
37900/// Helper entry wrapper to combineX86ShufflesRecursively.
37901static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
37902 const X86Subtarget &Subtarget) {
37903 return combineX86ShufflesRecursively(
37904 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
37905 /*HasVarMask*/ false,
37906 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
37907 Subtarget);
37908}
37909
37910/// Get the PSHUF-style mask from PSHUF node.
37911///
37912/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
37913/// PSHUF-style masks that can be reused with such instructions.
37914static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
37915 MVT VT = N.getSimpleValueType();
37916 SmallVector<int, 4> Mask;
37917 SmallVector<SDValue, 2> Ops;
37918 bool HaveMask =
37919 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
37920 (void)HaveMask;
37921 assert(HaveMask);
37922
37923 // If we have more than 128-bits, only the low 128-bits of shuffle mask
37924 // matter. Check that the upper masks are repeats and remove them.
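// For example (illustrative): a v8i32 PSHUFD mask {1,0,3,2, 5,4,7,6} repeats
// per 128-bit lane, so it is reduced here to the single-lane mask {1,0,3,2}.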
37925 if (VT.getSizeInBits() > 128) {
37926 int LaneElts = 128 / VT.getScalarSizeInBits();
37927#ifndef NDEBUG
37928 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
37929 for (int j = 0; j < LaneElts; ++j)
37930 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
37931 "Mask doesn't repeat in high 128-bit lanes!");
37932#endif
37933 Mask.resize(LaneElts);
37934 }
37935
37936 switch (N.getOpcode()) {
37937 case X86ISD::PSHUFD:
37938 return Mask;
37939 case X86ISD::PSHUFLW:
37940 Mask.resize(4);
37941 return Mask;
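// For PSHUFHW the low 4 elements are an identity; drop them and rebase the
// high 4 into a 4-wide mask. E.g. (illustrative) {0,1,2,3, 7,6,5,4} -> {3,2,1,0}.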
37942 case X86ISD::PSHUFHW:
37943 Mask.erase(Mask.begin(), Mask.begin() + 4);
37944 for (int &M : Mask)
37945 M -= 4;
37946 return Mask;
37947 default:
37948 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 37948)
;
37949 }
37950}
37951
37952/// Search for a combinable shuffle across a chain ending in pshufd.
37953///
37954/// We walk up the chain and look for a combinable shuffle, skipping over
37955/// shuffles that we could hoist this shuffle's transformation past without
37956/// altering anything.
37957static SDValue
37958combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
37959 SelectionDAG &DAG) {
37960 assert(N.getOpcode() == X86ISD::PSHUFD &&
37961 "Called with something other than an x86 128-bit half shuffle!");
37962 SDLoc DL(N);
37963
37964 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
37965 // of the shuffles in the chain so that we can form a fresh chain to replace
37966 // this one.
37967 SmallVector<SDValue, 8> Chain;
37968 SDValue V = N.getOperand(0);
37969 for (; V.hasOneUse(); V = V.getOperand(0)) {
37970 switch (V.getOpcode()) {
37971 default:
37972 return SDValue(); // Nothing combined!
37973
37974 case ISD::BITCAST:
37975 // Skip bitcasts as we always know the type for the target specific
37976 // instructions.
37977 continue;
37978
37979 case X86ISD::PSHUFD:
37980 // Found another dword shuffle.
37981 break;
37982
37983 case X86ISD::PSHUFLW:
37984 // Check that the low words (being shuffled) are the identity in the
37985 // dword shuffle, and the high words are self-contained.
37986 if (Mask[0] != 0 || Mask[1] != 1 ||
37987 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
37988 return SDValue();
37989
37990 Chain.push_back(V);
37991 continue;
37992
37993 case X86ISD::PSHUFHW:
37994 // Check that the high words (being shuffled) are the identity in the
37995 // dword shuffle, and the low words are self-contained.
37996 if (Mask[2] != 2 || Mask[3] != 3 ||
37997 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
37998 return SDValue();
37999
38000 Chain.push_back(V);
38001 continue;
38002
38003 case X86ISD::UNPCKL:
38004 case X86ISD::UNPCKH:
38005 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
38006 // shuffle into a preceding word shuffle.
38007 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
38008 V.getSimpleValueType().getVectorElementType() != MVT::i16)
38009 return SDValue();
38010
38011 // Search for a half-shuffle which we can combine with.
38012 unsigned CombineOp =
38013 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
38014 if (V.getOperand(0) != V.getOperand(1) ||
38015 !V->isOnlyUserOf(V.getOperand(0).getNode()))
38016 return SDValue();
38017 Chain.push_back(V);
38018 V = V.getOperand(0);
38019 do {
38020 switch (V.getOpcode()) {
38021 default:
38022 return SDValue(); // Nothing to combine.
38023
38024 case X86ISD::PSHUFLW:
38025 case X86ISD::PSHUFHW:
38026 if (V.getOpcode() == CombineOp)
38027 break;
38028
38029 Chain.push_back(V);
38030
38031 LLVM_FALLTHROUGH;
38032 case ISD::BITCAST:
38033 V = V.getOperand(0);
38034 continue;
38035 }
38036 break;
38037 } while (V.hasOneUse());
38038 break;
38039 }
38040 // Break out of the loop if we break out of the switch.
38041 break;
38042 }
38043
38044 if (!V.hasOneUse())
38045 // We fell out of the loop without finding a viable combining instruction.
38046 return SDValue();
38047
38048 // Merge this node's mask and our incoming mask.
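// Mask composition example (illustrative): if the incoming PSHUFD mask is
// {2,3,0,1} and V's mask is {1,0,3,2}, then M = VMask[M] yields {3,2,1,0}.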
38049 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38050 for (int &M : Mask)
38051 M = VMask[M];
38052 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
38053 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
38054
38055 // Rebuild the chain around this new shuffle.
38056 while (!Chain.empty()) {
38057 SDValue W = Chain.pop_back_val();
38058
38059 if (V.getValueType() != W.getOperand(0).getValueType())
38060 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
38061
38062 switch (W.getOpcode()) {
38063 default:
38064 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 38064)
;
38065
38066 case X86ISD::UNPCKL:
38067 case X86ISD::UNPCKH:
38068 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
38069 break;
38070
38071 case X86ISD::PSHUFD:
38072 case X86ISD::PSHUFLW:
38073 case X86ISD::PSHUFHW:
38074 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
38075 break;
38076 }
38077 }
38078 if (V.getValueType() != N.getValueType())
38079 V = DAG.getBitcast(N.getValueType(), V);
38080
38081 // Return the new chain to replace N.
38082 return V;
38083}
38084
38085// Attempt to commute shufps LHS loads:
38086// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
38087static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
38088 SelectionDAG &DAG) {
38089 // TODO: Add vXf64 support.
38090 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
38091 return SDValue();
38092
38093 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
38094 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
38095 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
38096 return SDValue();
38097 SDValue N0 = V.getOperand(0);
38098 SDValue N1 = V.getOperand(1);
38099 unsigned Imm = V.getConstantOperandVal(2);
38100 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
38101 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
38102 return SDValue();
38103 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
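// e.g. (illustrative) Imm = 0xB1 becomes 0x1B: the selector nibbles for the
// two result halves trade places to match the swapped operands.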
38104 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
38105 DAG.getTargetConstant(Imm, DL, MVT::i8));
38106 };
38107
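// A rough note on the immediate fixups below: commuting the inner SHUFP swaps
// which operand supplies the low and high halves of its result, so every 2-bit
// selector that reads from the commuted node must have bit 1 flipped. XORing
// with 0xAA flips all four selectors, 0x0A only the two low ones (reading from
// operand 0), and 0xA0 only the two high ones (reading from operand 1).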
38108 switch (N.getOpcode()) {
38109 case X86ISD::VPERMILPI:
38110 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
38111 unsigned Imm = N.getConstantOperandVal(1);
38112 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
38113 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
38114 }
38115 break;
38116 case X86ISD::SHUFP: {
38117 SDValue N0 = N.getOperand(0);
38118 SDValue N1 = N.getOperand(1);
38119 unsigned Imm = N.getConstantOperandVal(2);
38120 if (N0 == N1) {
38121 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
38122 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
38123 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
38124 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
38125 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
38126 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
38127 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
38128 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
38129 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
38130 }
38131 break;
38132 }
38133 }
38134
38135 return SDValue();
38136}
38137
38138// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
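// For example (illustrative): PSHUFD(AND(X, SplatC)) can be rewritten as
// AND(PSHUFD(X), PSHUFD(SplatC)); the shuffle of the splat constant folds
// away, so the binop ends up with at most one real shuffle feeding it.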
38139static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
38140 const SDLoc &DL) {
38141 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38142 EVT ShuffleVT = N.getValueType();
38143
38144 auto IsMergeableWithShuffle = [](SDValue Op) {
38145 // AllZeros/AllOnes constants are freely shuffled and will peek through
38146 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
38147 // merge with target shuffles if it has one use so shuffle combining is
38148 // likely to kick in.
38149 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
38150 ISD::isBuildVectorAllZeros(Op.getNode()) ||
38151 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
38152 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
38153 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse());
38154 };
38155 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
38156 // Ensure we only shuffle whole vector src elements, unless it's a logical
38157 // binop where we can more aggressively move shuffles from dst to src.
38158 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
38159 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
38160 };
38161
38162 unsigned Opc = N.getOpcode();
38163 switch (Opc) {
38164 // Unary and Unary+Permute Shuffles.
38165 case X86ISD::PSHUFB: {
38166 // Don't merge PSHUFB if it contains zero'd elements.
38167 SmallVector<int> Mask;
38168 SmallVector<SDValue> Ops;
38169 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
38170 Mask))
38171 break;
38172 LLVM_FALLTHROUGH;
38173 }
38174 case X86ISD::VBROADCAST:
38175 case X86ISD::MOVDDUP:
38176 case X86ISD::PSHUFD:
38177 case X86ISD::VPERMI:
38178 case X86ISD::VPERMILPI: {
38179 if (N.getOperand(0).getValueType() == ShuffleVT &&
38180 N->isOnlyUserOf(N.getOperand(0).getNode())) {
38181 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
38182 unsigned SrcOpcode = N0.getOpcode();
38183 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
38184 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
38185 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
38186 if (IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op01)) {
38187 SDValue LHS, RHS;
38188 Op00 = DAG.getBitcast(ShuffleVT, Op00);
38189 Op01 = DAG.getBitcast(ShuffleVT, Op01);
38190 if (N.getNumOperands() == 2) {
38191 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
38192 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
38193 } else {
38194 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
38195 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
38196 }
38197 EVT OpVT = N0.getValueType();
38198 return DAG.getBitcast(ShuffleVT,
38199 DAG.getNode(SrcOpcode, DL, OpVT,
38200 DAG.getBitcast(OpVT, LHS),
38201 DAG.getBitcast(OpVT, RHS)));
38202 }
38203 }
38204 }
38205 break;
38206 }
38207 // Binary and Binary+Permute Shuffles.
38208 case X86ISD::INSERTPS: {
38209 // Don't merge INSERTPS if it contains zero'd elements.
38210 unsigned InsertPSMask = N.getConstantOperandVal(2);
38211 unsigned ZeroMask = InsertPSMask & 0xF;
38212 if (ZeroMask != 0)
38213 break;
38214 LLVM_FALLTHROUGH;
38215 }
38216 case X86ISD::MOVSD:
38217 case X86ISD::MOVSS:
38218 case X86ISD::BLENDI:
38219 case X86ISD::SHUFP:
38220 case X86ISD::UNPCKH:
38221 case X86ISD::UNPCKL: {
38222 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
38223 N->isOnlyUserOf(N.getOperand(1).getNode())) {
38224 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
38225 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
38226 unsigned SrcOpcode = N0.getOpcode();
38227 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
38228 IsSafeToMoveShuffle(N0, SrcOpcode) &&
38229 IsSafeToMoveShuffle(N1, SrcOpcode)) {
38230 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
38231 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
38232 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
38233 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
38234 // Ensure the total number of shuffles doesn't increase by folding this
38235 // shuffle through to the source ops.
38236 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
38237 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
38238 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
38239 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
38240 SDValue LHS, RHS;
38241 Op00 = DAG.getBitcast(ShuffleVT, Op00);
38242 Op10 = DAG.getBitcast(ShuffleVT, Op10);
38243 Op01 = DAG.getBitcast(ShuffleVT, Op01);
38244 Op11 = DAG.getBitcast(ShuffleVT, Op11);
38245 if (N.getNumOperands() == 3) {
38246 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
38247 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
38248 } else {
38249 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
38250 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
38251 }
38252 EVT OpVT = N0.getValueType();
38253 return DAG.getBitcast(ShuffleVT,
38254 DAG.getNode(SrcOpcode, DL, OpVT,
38255 DAG.getBitcast(OpVT, LHS),
38256 DAG.getBitcast(OpVT, RHS)));
38257 }
38258 }
38259 }
38260 break;
38261 }
38262 }
38263 return SDValue();
38264}
38265
38266/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
38267static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
38268 SelectionDAG &DAG,
38269 const SDLoc &DL) {
38270 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
38271
38272 MVT VT = V.getSimpleValueType();
38273 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
38274 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
38275 unsigned SrcOpc0 = Src0.getOpcode();
38276 unsigned SrcOpc1 = Src1.getOpcode();
38277 EVT SrcVT0 = Src0.getValueType();
38278 EVT SrcVT1 = Src1.getValueType();
38279
38280 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
38281 return SDValue();
38282
38283 switch (SrcOpc0) {
38284 case X86ISD::MOVDDUP: {
38285 SDValue LHS = Src0.getOperand(0);
38286 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
38287 SDValue Res =
38288 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
38289 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
38290 return DAG.getBitcast(VT, Res);
38291 }
38292 case X86ISD::VPERMILPI:
38293 // TODO: Handle v4f64 permutes with different low/high lane masks.
38294 if (SrcVT0 == MVT::v4f64) {
38295 uint64_t Mask = Src0.getConstantOperandVal(1);
38296 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
38297 break;
38298 }
38299 LLVM_FALLTHROUGH;
38300 case X86ISD::VSHLI:
38301 case X86ISD::VSRLI:
38302 case X86ISD::VSRAI:
38303 case X86ISD::PSHUFD:
38304 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
38305 SDValue LHS = Src0.getOperand(0);
38306 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
38307 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
38308 V.getOperand(2));
38309 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
38310 return DAG.getBitcast(VT, Res);
38311 }
38312 break;
38313 }
38314
38315 return SDValue();
38316}
38317
38318/// Try to combine x86 target specific shuffles.
38319static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
38320 TargetLowering::DAGCombinerInfo &DCI,
38321 const X86Subtarget &Subtarget) {
38322 SDLoc DL(N);
38323 MVT VT = N.getSimpleValueType();
38324 SmallVector<int, 4> Mask;
38325 unsigned Opcode = N.getOpcode();
38326
38327 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
38328 return R;
38329
38330 if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
38331 return R;
38332
38333 // Handle specific target shuffles.
38334 switch (Opcode) {
38335 case X86ISD::MOVDDUP: {
38336 SDValue Src = N.getOperand(0);
38337 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
38338 if (VT == MVT::v2f64 && Src.hasOneUse() &&
38339 ISD::isNormalLoad(Src.getNode())) {
38340 LoadSDNode *LN = cast<LoadSDNode>(Src);
38341 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
38342 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
38343 DCI.CombineTo(N.getNode(), Movddup);
38344 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
38345 DCI.recursivelyDeleteUnusedNodes(LN);
38346 return N; // Return N so it doesn't get rechecked!
38347 }
38348 }
38349
38350 return SDValue();
38351 }
38352 case X86ISD::VBROADCAST: {
38353 SDValue Src = N.getOperand(0);
38354 SDValue BC = peekThroughBitcasts(Src);
38355 EVT SrcVT = Src.getValueType();
38356 EVT BCVT = BC.getValueType();
38357
38358 // If broadcasting from another shuffle, attempt to simplify it.
38359 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
38360 if (isTargetShuffle(BC.getOpcode()) &&
38361 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
38362 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
38363 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
38364 SM_SentinelUndef);
38365 for (unsigned i = 0; i != Scale; ++i)
38366 DemandedMask[i] = i;
38367 if (SDValue Res = combineX86ShufflesRecursively(
38368 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
38369 X86::MaxShuffleCombineDepth,
38370 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
38371 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
38372 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
38373 DAG.getBitcast(SrcVT, Res));
38374 }
38375
38376 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
38377 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
38378 if (Src.getOpcode() == ISD::BITCAST &&
38379 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
38380 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
38381 FixedVectorType::isValidElementType(
38382 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
38383 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
38384 VT.getVectorNumElements());
38385 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
38386 }
38387
38388 // Reduce broadcast source vector to lowest 128-bits.
38389 if (SrcVT.getSizeInBits() > 128)
38390 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
38391 extract128BitVector(Src, 0, DAG, DL));
38392
38393 // broadcast(scalar_to_vector(x)) -> broadcast(x).
38394 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
38395 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
38396
38397 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
38398 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
38399 isNullConstant(Src.getOperand(1)) &&
38400 DAG.getTargetLoweringInfo().isTypeLegal(
38401 Src.getOperand(0).getValueType()))
38402 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
38403
38404 // Share broadcast with the longest vector and extract low subvector (free).
38405 // Ensure the same SDValue from the SDNode use is being used.
38406 for (SDNode *User : Src->uses())
38407 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
38408 Src == User->getOperand(0) &&
38409 User->getValueSizeInBits(0).getFixedSize() >
38410 VT.getFixedSizeInBits()) {
38411 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
38412 VT.getSizeInBits());
38413 }
38414
38415 // vbroadcast(scalarload X) -> vbroadcast_load X
38416 // For float loads, extract other uses of the scalar from the broadcast.
38417 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
38418 ISD::isNormalLoad(Src.getNode())) {
38419 LoadSDNode *LN = cast<LoadSDNode>(Src);
38420 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38421 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38422 SDValue BcastLd =
38423 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
38424 LN->getMemoryVT(), LN->getMemOperand());
38425 // If the load value is used only by N, replace it via CombineTo N.
38426 bool NoReplaceExtract = Src.hasOneUse();
38427 DCI.CombineTo(N.getNode(), BcastLd);
38428 if (NoReplaceExtract) {
38429 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38430 DCI.recursivelyDeleteUnusedNodes(LN);
38431 } else {
38432 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
38433 DAG.getIntPtrConstant(0, DL));
38434 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
38435 }
38436 return N; // Return N so it doesn't get rechecked!
38437 }
38438
38439 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
38440 // i16. So shrink it ourselves if we can make a broadcast_load.
38441 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
38442 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
38443 assert(Subtarget.hasAVX2() && "Expected AVX2");
38444 SDValue TruncIn = Src.getOperand(0);
38445
38446 // If this is a truncate of a non extending load we can just narrow it to
38447 // use a broadcast_load.
38448 if (ISD::isNormalLoad(TruncIn.getNode())) {
38449 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
38450 // Unless it's volatile or atomic.
38451 if (LN->isSimple()) {
38452 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38453 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38454 SDValue BcastLd = DAG.getMemIntrinsicNode(
38455 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
38456 LN->getPointerInfo(), LN->getOriginalAlign(),
38457 LN->getMemOperand()->getFlags());
38458 DCI.CombineTo(N.getNode(), BcastLd);
38459 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38460 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
38461 return N; // Return N so it doesn't get rechecked!
38462 }
38463 }
38464
38465 // If this is a truncate of an i16 extload, we can directly replace it.
38466 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
38467 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
38468 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
38469 if (LN->getMemoryVT().getSizeInBits() == 16) {
38470 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38471 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38472 SDValue BcastLd =
38473 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
38474 LN->getMemoryVT(), LN->getMemOperand());
38475 DCI.CombineTo(N.getNode(), BcastLd);
38476 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38477 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
38478 return N; // Return N so it doesn't get rechecked!
38479 }
38480 }
38481
38482 // If this is a truncate of a load that has been shifted right, we can
38483 // offset the pointer and use a narrower load.
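// e.g. (illustrative) broadcasting trunc-to-i16 of ((i32 load from X) >> 16)
// becomes a 16-bit vbroadcast_load from address X+2.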
38484 if (TruncIn.getOpcode() == ISD::SRL &&
38485 TruncIn.getOperand(0).hasOneUse() &&
38486 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
38487 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
38488 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
38489 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
38490 // Make sure the shift amount and the load size are divisible by 16.
38491 // Don't do this if the load is volatile or atomic.
38492 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
38493 LN->isSimple()) {
38494 unsigned Offset = ShiftAmt / 8;
38495 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38496 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
38497 TypeSize::Fixed(Offset), DL);
38498 SDValue Ops[] = { LN->getChain(), Ptr };
38499 SDValue BcastLd = DAG.getMemIntrinsicNode(
38500 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
38501 LN->getPointerInfo().getWithOffset(Offset),
38502 LN->getOriginalAlign(),
38503 LN->getMemOperand()->getFlags());
38504 DCI.CombineTo(N.getNode(), BcastLd);
38505 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38506 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
38507 return N; // Return N so it doesn't get rechecked!
38508 }
38509 }
38510 }
38511
38512 // vbroadcast(vzload X) -> vbroadcast_load X
38513 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
38514 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
38515 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
38516 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38517 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
38518 SDValue BcastLd =
38519 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
38520 LN->getMemoryVT(), LN->getMemOperand());
38521 DCI.CombineTo(N.getNode(), BcastLd);
38522 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38523 DCI.recursivelyDeleteUnusedNodes(LN);
38524 return N; // Return N so it doesn't get rechecked!
38525 }
38526 }
38527
38528 // vbroadcast(vector load X) -> vbroadcast_load
38529 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
38530 SrcVT == MVT::v4i32) &&
38531 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
38532 LoadSDNode *LN = cast<LoadSDNode>(Src);
38533 // Unless the load is volatile or atomic.
38534 if (LN->isSimple()) {
38535 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38536 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38537 SDValue BcastLd = DAG.getMemIntrinsicNode(
38538 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
38539 LN->getPointerInfo(), LN->getOriginalAlign(),
38540 LN->getMemOperand()->getFlags());
38541 DCI.CombineTo(N.getNode(), BcastLd);
38542 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
38543 DCI.recursivelyDeleteUnusedNodes(LN);
38544 return N; // Return N so it doesn't get rechecked!
38545 }
38546 }
38547
38548 return SDValue();
38549 }
38550 case X86ISD::VZEXT_MOVL: {
38551 SDValue N0 = N.getOperand(0);
38552
38553 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
38554 // the load is volatile.
38555 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
38556 auto *LN = cast<LoadSDNode>(N0);
38557 if (SDValue VZLoad =
38558 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
38559 DCI.CombineTo(N.getNode(), VZLoad);
38560 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
38561 DCI.recursivelyDeleteUnusedNodes(LN);
38562 return N;
38563 }
38564 }
38565
38566 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
38567 // and can just use a VZEXT_LOAD.
38568 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
38569 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
38570 auto *LN = cast<MemSDNode>(N0);
38571 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
38572 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38573 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38574 SDValue VZLoad =
38575 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
38576 LN->getMemoryVT(), LN->getMemOperand());
38577 DCI.CombineTo(N.getNode(), VZLoad);
38578 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
38579 DCI.recursivelyDeleteUnusedNodes(LN);
38580 return N;
38581 }
38582 }
38583
38584 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
38585 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
38586 // if the upper bits of the i64 are zero.
38587 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38588 N0.getOperand(0).hasOneUse() &&
38589 N0.getOperand(0).getValueType() == MVT::i64) {
38590 SDValue In = N0.getOperand(0);
38591 APInt Mask = APInt::getHighBitsSet(64, 32);
38592 if (DAG.MaskedValueIsZero(In, Mask)) {
38593 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
38594 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
38595 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
38596 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
38597 return DAG.getBitcast(VT, Movl);
38598 }
38599 }
38600
38601 // Load a scalar integer constant directly to XMM instead of transferring an
38602 // immediate value from GPR.
38603 // vzext_movl (scalar_to_vector C) --> load [C,0...]
38604 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38605 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
38606 // Create a vector constant - scalar constant followed by zeros.
38607 EVT ScalarVT = N0.getOperand(0).getValueType();
38608 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
38609 unsigned NumElts = VT.getVectorNumElements();
38610 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
38611 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
38612 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
38613
38614 // Load the vector constant from constant pool.
38615 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
38616 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
38617 MachinePointerInfo MPI =
38618 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
38619 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
38620 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
38621 MachineMemOperand::MOLoad);
38622 }
38623 }
38624
38625 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
38626 // insert into a zero vector. This helps get VZEXT_MOVL closer to
38627 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
38628 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
38629 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
38630 SDValue V = peekThroughOneUseBitcasts(N0);
38631
38632 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
38633 isNullConstant(V.getOperand(2))) {
38634 SDValue In = V.getOperand(1);
38635 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
38636 In.getValueSizeInBits() /
38637 VT.getScalarSizeInBits());
38638 In = DAG.getBitcast(SubVT, In);
38639 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
38640 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
38641 getZeroVector(VT, Subtarget, DAG, DL), Movl,
38642 V.getOperand(2));
38643 }
38644 }
38645
38646 return SDValue();
38647 }
38648 case X86ISD::BLENDI: {
38649 SDValue N0 = N.getOperand(0);
38650 SDValue N1 = N.getOperand(1);
38651
38652 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
38653 // TODO: Handle MVT::v16i16 repeated blend mask.
38654 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
38655 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
38656 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
38657 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
38658 SrcVT.getScalarSizeInBits() >= 32) {
38659 unsigned BlendMask = N.getConstantOperandVal(2);
38660 unsigned Size = VT.getVectorNumElements();
38661 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
38662 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
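// e.g. (illustrative) scaling a 4-element blend mask 0b0101 by Scale = 2 gives
// the 8-element mask 0b00110011 - each mask bit is repeated for every narrower
// element it covers.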
38663 return DAG.getBitcast(
38664 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
38665 N1.getOperand(0),
38666 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
38667 }
38668 }
38669 return SDValue();
38670 }
38671 case X86ISD::VPERMI: {
38672 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
38673 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
38674 SDValue N0 = N.getOperand(0);
38675 SDValue N1 = N.getOperand(1);
38676 unsigned EltSizeInBits = VT.getScalarSizeInBits();
38677 if (N0.getOpcode() == ISD::BITCAST &&
38678 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
38679 SDValue Src = N0.getOperand(0);
38680 EVT SrcVT = Src.getValueType();
38681 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
38682 return DAG.getBitcast(VT, Res);
38683 }
38684 return SDValue();
38685 }
38686 case X86ISD::VPERM2X128: {
38687 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
38688 SDValue LHS = N->getOperand(0);
38689 SDValue RHS = N->getOperand(1);
38690 if (LHS.getOpcode() == ISD::BITCAST &&
38691 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
38692 EVT SrcVT = LHS.getOperand(0).getValueType();
38693 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
38694 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
38695 DAG.getBitcast(SrcVT, LHS),
38696 DAG.getBitcast(SrcVT, RHS),
38697 N->getOperand(2)));
38698 }
38699 }
38700
38701 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
38702 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
38703 return Res;
38704
38705 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
38706 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
38707 auto FindSubVector128 = [&](unsigned Idx) {
38708 if (Idx > 3)
38709 return SDValue();
38710 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
38711 SmallVector<SDValue> SubOps;
38712 if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
38713 return SubOps[Idx & 1];
38714 unsigned NumElts = Src.getValueType().getVectorNumElements();
38715 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
38716 Src.getOperand(1).getValueSizeInBits() == 128 &&
38717 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
38718 return Src.getOperand(1);
38719 }
38720 return SDValue();
38721 };
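// e.g. (illustrative) with LHS = concat(X,Y), RHS = concat(Z,W) and Imm = 0x31,
// FindSubVector128 returns SubLo = Y and SubHi = W, so the vperm2x128 folds to
// concat(Y,W).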
38722 unsigned Imm = N.getConstantOperandVal(2);
38723 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
38724 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
38725 MVT SubVT = VT.getHalfNumVectorElementsVT();
38726 SubLo = DAG.getBitcast(SubVT, SubLo);
38727 SubHi = DAG.getBitcast(SubVT, SubHi);
38728 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
38729 }
38730 }
38731 return SDValue();
38732 }
38733 case X86ISD::PSHUFD:
38734 case X86ISD::PSHUFLW:
38735 case X86ISD::PSHUFHW:
38736 Mask = getPSHUFShuffleMask(N);
38737 assert(Mask.size() == 4);
38738 break;
38739 case X86ISD::MOVSD:
38740 case X86ISD::MOVSH:
38741 case X86ISD::MOVSS: {
38742 SDValue N0 = N.getOperand(0);
38743 SDValue N1 = N.getOperand(1);
38744
38745 // Canonicalize scalar FPOps:
38746 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
38747 // If commutable, allow OP(N1[0], N0[0]).
38748 unsigned Opcode1 = N1.getOpcode();
38749 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
38750 Opcode1 == ISD::FDIV) {
38751 SDValue N10 = N1.getOperand(0);
38752 SDValue N11 = N1.getOperand(1);
38753 if (N10 == N0 ||
38754 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
38755 if (N10 != N0)
38756 std::swap(N10, N11);
38757 MVT SVT = VT.getVectorElementType();
38758 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
38759 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
38760 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
38761 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
38762 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
38763 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
38764 }
38765 }
38766
38767 return SDValue();
38768 }
38769 case X86ISD::INSERTPS: {
38770 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
38771 SDValue Op0 = N.getOperand(0);
38772 SDValue Op1 = N.getOperand(1);
38773 unsigned InsertPSMask = N.getConstantOperandVal(2);
38774 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
38775 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
38776 unsigned ZeroMask = InsertPSMask & 0xF;
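// Worked example (illustrative): InsertPSMask = 0x9C = 0b10'01'1100 decodes to
// SrcIdx = 2 (element taken from Op1), DstIdx = 1 (slot written in the result)
// and ZeroMask = 0b1100 (result elements 2 and 3 are zeroed).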
38777
38778 // If we zero out all elements from Op0 then we don't need to reference it.
38779 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
38780 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
38781 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38782
38783 // If we zero out the element from Op1 then we don't need to reference it.
38784 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
38785 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38786 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38787
38788 // Attempt to merge insertps Op1 with an inner target shuffle node.
38789 SmallVector<int, 8> TargetMask1;
38790 SmallVector<SDValue, 2> Ops1;
38791 APInt KnownUndef1, KnownZero1;
38792 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
38793 KnownZero1)) {
38794 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
38795 // Zero/UNDEF insertion - zero out element and remove dependency.
38796 InsertPSMask |= (1u << DstIdx);
38797 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
38798 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38799 }
38800 // Update insertps mask srcidx and reference the source input directly.
38801 int M = TargetMask1[SrcIdx];
38802 assert(0 <= M && M < 8 && "Shuffle index out of range");
38803 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
38804 Op1 = Ops1[M < 4 ? 0 : 1];
38805 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38806 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38807 }
38808
38809 // Attempt to merge insertps Op0 with an inner target shuffle node.
38810 SmallVector<int, 8> TargetMask0;
38811 SmallVector<SDValue, 2> Ops0;
38812 APInt KnownUndef0, KnownZero0;
38813 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
38814 KnownZero0)) {
38815 bool Updated = false;
38816 bool UseInput00 = false;
38817 bool UseInput01 = false;
38818 for (int i = 0; i != 4; ++i) {
38819 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
38820 // No change if element is already zero or the inserted element.
38821 continue;
38822 } else if (KnownUndef0[i] || KnownZero0[i]) {
38823 // If the target mask is undef/zero then we must zero the element.
38824 InsertPSMask |= (1u << i);
38825 Updated = true;
38826 continue;
38827 }
38828
38829 // The input vector element must be inline.
38830 int M = TargetMask0[i];
38831 if (M != i && M != (i + 4))
38832 return SDValue();
38833
38834 // Determine which inputs of the target shuffle we're using.
38835 UseInput00 |= (0 <= M && M < 4);
38836 UseInput01 |= (4 <= M);
38837 }
38838
38839 // If we're not using both inputs of the target shuffle then use the
38840 // referenced input directly.
38841 if (UseInput00 && !UseInput01) {
38842 Updated = true;
38843 Op0 = Ops0[0];
38844 } else if (!UseInput00 && UseInput01) {
38845 Updated = true;
38846 Op0 = Ops0[1];
38847 }
38848
38849 if (Updated)
38850 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
38851 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
38852 }
38853
38854 // If we're inserting an element from a vbroadcast load, fold the
38855 // load into the X86insertps instruction. We need to convert the scalar
38856 // load to a vector and clear the source lane of the INSERTPS control.
38857 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
38858 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
38859 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
38860 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
38861 MemIntr->getBasePtr(),
38862 MemIntr->getMemOperand());
38863 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
38864 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
38865 Load),
38866 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
38867 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
38868 return Insert;
38869 }
38870 }
38871
38872 return SDValue();
38873 }
38874 default:
38875 return SDValue();
38876 }
38877
38878 // Nuke no-op shuffles that show up after combining.
38879 if (isNoopShuffleMask(Mask))
38880 return N.getOperand(0);
38881
38882 // Look for simplifications involving one or two shuffle instructions.
38883 SDValue V = N.getOperand(0);
38884 switch (N.getOpcode()) {
38885 default:
38886 break;
38887 case X86ISD::PSHUFLW:
38888 case X86ISD::PSHUFHW:
38889 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
38890
38891 // See if this reduces to a PSHUFD which is no more expensive and can
38892 // combine with more operations. Note that it has to at least flip the
38893 // dwords as otherwise it would have been removed as a no-op.
38894 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
38895 int DMask[] = {0, 1, 2, 3};
38896 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
38897 DMask[DOffset + 0] = DOffset + 1;
38898 DMask[DOffset + 1] = DOffset + 0;
38899 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
38900 V = DAG.getBitcast(DVT, V);
38901 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
38902 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
38903 return DAG.getBitcast(VT, V);
38904 }
38905
38906 // Look for shuffle patterns which can be implemented as a single unpack.
38907 // FIXME: This doesn't handle the location of the PSHUFD generically, and
38908 // only works when we have a PSHUFD followed by two half-shuffles.
38909 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
38910 (V.getOpcode() == X86ISD::PSHUFLW ||
38911 V.getOpcode() == X86ISD::PSHUFHW) &&
38912 V.getOpcode() != N.getOpcode() &&
38913 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
38914 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
38915 if (D.getOpcode() == X86ISD::PSHUFD) {
38916 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
38917 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
38918 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38919 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
38920 int WordMask[8];
38921 for (int i = 0; i < 4; ++i) {
38922 WordMask[i + NOffset] = Mask[i] + NOffset;
38923 WordMask[i + VOffset] = VMask[i] + VOffset;
38924 }
38925 // Map the word mask through the DWord mask.
38926 int MappedMask[8];
38927 for (int i = 0; i < 8; ++i)
38928 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
38929 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
38930 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
38931 // We can replace all three shuffles with an unpack.
38932 V = DAG.getBitcast(VT, D.getOperand(0));
38933 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
38934 : X86ISD::UNPCKH,
38935 DL, VT, V, V);
38936 }
38937 }
38938 }
38939
38940 break;
38941
38942 case X86ISD::PSHUFD:
38943 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
38944 return NewN;
38945
38946 break;
38947 }
38948
38949 return SDValue();
38950}
38951
38952/// Checks if the shuffle mask takes subsequent elements
38953/// alternately from two vectors.
38954/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
38955static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
38956
38957 int ParitySrc[2] = {-1, -1};
38958 unsigned Size = Mask.size();
38959 for (unsigned i = 0; i != Size; ++i) {
38960 int M = Mask[i];
38961 if (M < 0)
38962 continue;
38963
38964 // Make sure we are using the matching element from the input.
38965 if ((M % Size) != i)
38966 return false;
38967
38968 // Make sure we use the same input for all elements of the same parity.
38969 int Src = M / Size;
38970 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
38971 return false;
38972 ParitySrc[i % 2] = Src;
38973 }
38974
38975 // Make sure each input is used.
38976 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
38977 return false;
38978
38979 Op0Even = ParitySrc[0] == 0;
38980 return true;
38981}
38982
38983/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
38984/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
38985/// are written to the parameters \p Opnd0 and \p Opnd1.
38986///
38987/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
38988/// so it is easier to generically match. We also insert dummy vector shuffle
38989/// nodes for the operands which explicitly discard the lanes which are unused
38990/// by this operation, so that the fact that they're unused can flow
38991/// through the rest of the combiner.
38992static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
38993 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
38994 bool &IsSubAdd) {
38995
38996 EVT VT = N->getValueType(0);
38997 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38998 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
38999 !VT.getSimpleVT().isFloatingPoint())
39000 return false;
39001
39002 // We only handle target-independent shuffles.
39003 // FIXME: It would be easy and harmless to use the target shuffle mask
39004 // extraction tool to support more.
39005 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
39006 return false;
39007
39008 SDValue V1 = N->getOperand(0);
39009 SDValue V2 = N->getOperand(1);
39010
39011 // Make sure we have an FADD and an FSUB.
39012 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
39013 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
39014 V1.getOpcode() == V2.getOpcode())
39015 return false;
39016
39017 // If there are other uses of these operations we can't fold them.
39018 if (!V1->hasOneUse() || !V2->hasOneUse())
39019 return false;
39020
39021 // Ensure that both operations have the same operands. Note that we can
39022 // commute the FADD operands.
39023 SDValue LHS, RHS;
39024 if (V1.getOpcode() == ISD::FSUB) {
39025 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
39026 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
39027 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
39028 return false;
39029 } else {
39030 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
39031 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
39032 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
39033 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
39034 return false;
39035 }
39036
39037 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
39038 bool Op0Even;
39039 if (!isAddSubOrSubAddMask(Mask, Op0Even))
39040 return false;
39041
39042 // It's a subadd if the vector in the even parity is an FADD.
39043 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
39044 : V2->getOpcode() == ISD::FADD;
39045
39046 Opnd0 = LHS;
39047 Opnd1 = RHS;
39048 return true;
39049}
39050
39051/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
39052static SDValue combineShuffleToFMAddSub(SDNode *N,
39053 const X86Subtarget &Subtarget,
39054 SelectionDAG &DAG) {
39055 // We only handle target-independent shuffles.
39056 // FIXME: It would be easy and harmless to use the target shuffle mask
39057 // extraction tool to support more.
39058 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
39059 return SDValue();
39060
39061 MVT VT = N->getSimpleValueType(0);
39062 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39063 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
39064 return SDValue();
39065
39066 // We're trying to match shuffle(fma(a, b, c), X86Fmsub(a, b, c)).
39067 SDValue Op0 = N->getOperand(0);
39068 SDValue Op1 = N->getOperand(1);
39069 SDValue FMAdd = Op0, FMSub = Op1;
39070 if (FMSub.getOpcode() != X86ISD::FMSUB)
39071 std::swap(FMAdd, FMSub);
39072
39073 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
39074 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
39075 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
39076 FMAdd.getOperand(2) != FMSub.getOperand(2))
39077 return SDValue();
39078
39079 // Check for correct shuffle mask.
39080 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
39081 bool Op0Even;
39082 if (!isAddSubOrSubAddMask(Mask, Op0Even))
39083 return SDValue();
39084
39085 // FMAddSub takes zeroth operand from FMSub node.
39086 SDLoc DL(N);
39087 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
39088 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
39089 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
39090 FMAdd.getOperand(2));
39091}
39092
39093/// Try to combine a shuffle into a target-specific add-sub or
39094/// mul-add-sub node.
39095static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
39096 const X86Subtarget &Subtarget,
39097 SelectionDAG &DAG) {
39098 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
39099 return V;
39100
39101 SDValue Opnd0, Opnd1;
39102 bool IsSubAdd;
39103 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
39104 return SDValue();
39105
39106 MVT VT = N->getSimpleValueType(0);
39107 SDLoc DL(N);
39108
39109 // Try to generate X86ISD::FMADDSUB node here.
39110 SDValue Opnd2;
39111 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
39112 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
39113 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
39114 }
39115
39116 if (IsSubAdd)
39117 return SDValue();
39118
39119 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
39120 // the ADDSUB idiom has been successfully recognized. There are no known
39121 // X86 targets with 512-bit ADDSUB instructions!
39122 if (VT.is512BitVector())
39123 return SDValue();
39124
39125 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
39126 // the ADDSUB idiom has been successfully recognized. There are no known
39127 // X86 targets with FP16 ADDSUB instructions!
39128 if (VT.getVectorElementType() == MVT::f16)
39129 return SDValue();
39130
39131 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
39132}
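
Aside: the same parity convention for the plain ADDSUB node that the non-FMA path above emits, as a tiny standalone sketch (even lanes subtract, odd lanes add), to the best of my reading of the ADDSUBPS/ADDSUBPD semantics.

#include <cstdio>

int main() {
  double A[4] = {1, 2, 3, 4}, B[4] = {10, 10, 10, 10}, R[4];
  for (int i = 0; i != 4; ++i)
    R[i] = (i % 2 == 0) ? A[i] - B[i] : A[i] + B[i]; // even: sub, odd: add
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // -9 12 -7 14
}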
39133
39134// We are looking for a shuffle where both sources are concatenated with undef
39135// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
39136// if we can express this as a single-source shuffle, that's preferable.
39137static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
39138 const X86Subtarget &Subtarget) {
39139 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
39140 return SDValue();
39141
39142 EVT VT = N->getValueType(0);
39143
39144 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
39145 if (!VT.is128BitVector() && !VT.is256BitVector())
39146 return SDValue();
39147
39148 if (VT.getVectorElementType() != MVT::i32 &&
39149 VT.getVectorElementType() != MVT::i64 &&
39150 VT.getVectorElementType() != MVT::f32 &&
39151 VT.getVectorElementType() != MVT::f64)
39152 return SDValue();
39153
39154 SDValue N0 = N->getOperand(0);
39155 SDValue N1 = N->getOperand(1);
39156
39157 // Check that both sources are concats with undef.
39158 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
39159 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
39160 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
39161 !N1.getOperand(1).isUndef())
39162 return SDValue();
39163
39164 // Construct the new shuffle mask. Elements from the first source retain their
39165 // index, but elements from the second source no longer need to skip an undef.
39166 SmallVector<int, 8> Mask;
39167 int NumElts = VT.getVectorNumElements();
39168
39169 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
39170 for (int Elt : SVOp->getMask())
39171 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
39172
39173 SDLoc DL(N);
39174 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
39175 N1.getOperand(0));
39176 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
39177}
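
Aside: a standalone sketch, with made-up values, of the mask re-basing in the loop above. Once both concat(t, undef) sources are merged into one concat(t1, t2), elements that pointed into the second source no longer skip an undef upper half.

#include <cstdio>
#include <vector>

int main() {
  int NumElts = 8; // v8i32 output; each source is concat(t, undef), t = 4 elts
  std::vector<int> OldMask = {0, 1, 8, 9, 2, 3, 10, 11}; // 8..11 hit t2
  for (int Elt : OldMask)
    std::printf("%d ", Elt < NumElts ? Elt : Elt - NumElts / 2);
  std::printf("\n"); // 0 1 4 5 2 3 6 7 -> indices into concat(t1, t2)
}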
39178
39179/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
39180/// low half of each source vector and does not set any high half elements in
39181/// the destination vector, narrow the shuffle to half its original size.
39182static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
39183 if (!Shuf->getValueType(0).isSimple())
39184 return SDValue();
39185 MVT VT = Shuf->getSimpleValueType(0);
39186 if (!VT.is256BitVector() && !VT.is512BitVector())
39187 return SDValue();
39188
39189 // See if we can ignore all of the high elements of the shuffle.
39190 ArrayRef<int> Mask = Shuf->getMask();
39191 if (!isUndefUpperHalf(Mask))
39192 return SDValue();
39193
39194 // Check if the shuffle mask accesses only the low half of each input vector
39195 // (half-index output is 0 or 2).
39196 int HalfIdx1, HalfIdx2;
39197 SmallVector<int, 8> HalfMask(Mask.size() / 2);
39198 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
39199 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
39200 return SDValue();
39201
39202 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
39203 // The trick is knowing that all of the insert/extract are actually free
39204 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
39205 // of narrow inputs into a narrow output, and that is always cheaper than
39206 // the wide shuffle that we started with.
39207 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
39208 Shuf->getOperand(1), HalfMask, HalfIdx1,
39209 HalfIdx2, false, DAG, /*UseConcat*/true);
39210}
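
Aside: a small sketch of the half-index encoding that the '% 2 == 1' checks above appear to assume (0..3 naming op0.lo, op0.hi, op1.lo, op1.hi, so the low halves are the even indices 0 and 2). The encoding shown here is inferred from the comment and is only for illustration.

#include <cstdio>

int main() {
  const char *OpHalf[] = {"op0.lo", "op0.hi", "op1.lo", "op1.hi"};
  for (int HalfIdx = 0; HalfIdx != 4; ++HalfIdx)
    std::printf("HalfIdx %d -> %s%s\n", HalfIdx, OpHalf[HalfIdx],
                HalfIdx % 2 == 1 ? " (rejected: high half)" : "");
}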
39211
39212static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
39213 TargetLowering::DAGCombinerInfo &DCI,
39214 const X86Subtarget &Subtarget) {
39215 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
39216 if (SDValue V = narrowShuffle(Shuf, DAG))
39217 return V;
39218
39219 // If we have legalized the vector types, look for blends of FADD and FSUB
39220 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
39221 SDLoc dl(N);
39222 EVT VT = N->getValueType(0);
39223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39224 if (TLI.isTypeLegal(VT))
39225 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
39226 return AddSub;
39227
39228 // Attempt to combine into a vector load/broadcast.
39229 if (SDValue LD = combineToConsecutiveLoads(
39230 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
39231 return LD;
39232
39233 // For AVX2, we sometimes want to combine
39234 // (vector_shuffle <mask> (concat_vectors t1, undef)
39235 // (concat_vectors t2, undef))
39236 // Into:
39237 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
39238 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
39239 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
39240 return ShufConcat;
39241
39242 if (isTargetShuffle(N->getOpcode())) {
39243 SDValue Op(N, 0);
39244 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
39245 return Shuffle;
39246
39247 // Try recursively combining arbitrary sequences of x86 shuffle
39248 // instructions into higher-order shuffles. We do this after combining
39249 // specific PSHUF instruction sequences into their minimal form so that we
39250 // can evaluate how many specialized shuffle instructions are involved in
39251 // a particular chain.
39252 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
39253 return Res;
39254
39255 // Simplify source operands based on shuffle mask.
39256 // TODO - merge this into combineX86ShufflesRecursively.
39257 APInt KnownUndef, KnownZero;
39258 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
39259 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
39260 DCI))
39261 return SDValue(N, 0);
39262 }
39263
39264 return SDValue();
39265}
39266
39267// Simplify variable target shuffle masks based on the demanded elements.
39268// TODO: Handle DemandedBits in mask indices as well?
39269bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
39270 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
39271 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
39272 // If we're demanding all elements, don't bother trying to simplify the mask.
39273 unsigned NumElts = DemandedElts.getBitWidth();
39274 if (DemandedElts.isAllOnesValue())
39275 return false;
39276
39277 SDValue Mask = Op.getOperand(MaskIndex);
39278 if (!Mask.hasOneUse())
39279 return false;
39280
39281 // Attempt to generically simplify the variable shuffle mask.
39282 APInt MaskUndef, MaskZero;
39283 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
39284 Depth + 1))
39285 return true;
39286
39287 // Attempt to extract+simplify a (constant pool load) shuffle mask.
39288 // TODO: Support other types from getTargetShuffleMaskIndices?
39289 SDValue BC = peekThroughOneUseBitcasts(Mask);
39290 EVT BCVT = BC.getValueType();
39291 auto *Load = dyn_cast<LoadSDNode>(BC);
39292 if (!Load)
39293 return false;
39294
39295 const Constant *C = getTargetConstantFromNode(Load);
39296 if (!C)
39297 return false;
39298
39299 Type *CTy = C->getType();
39300 if (!CTy->isVectorTy() ||
39301 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
39302 return false;
39303
39304 // Handle scaling for i64 elements on 32-bit targets.
39305 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
39306 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
39307 return false;
39308 unsigned Scale = NumCstElts / NumElts;
39309
39310 // Simplify mask if we have an undemanded element that is not undef.
39311 bool Simplified = false;
39312 SmallVector<Constant *, 32> ConstVecOps;
39313 for (unsigned i = 0; i != NumCstElts; ++i) {
39314 Constant *Elt = C->getAggregateElement(i);
39315 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
39316 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
39317 Simplified = true;
39318 continue;
39319 }
39320 ConstVecOps.push_back(Elt);
39321 }
39322 if (!Simplified)
39323 return false;
39324
39325 // Generate new constant pool entry + legalize immediately for the load.
39326 SDLoc DL(Op);
39327 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
39328 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
39329 SDValue NewMask = TLO.DAG.getLoad(
39330 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
39331 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
39332 Load->getAlign());
39333 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
39334}
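
Aside: a standalone sketch, with made-up numbers, of the NumCstElts / NumElts scaling used above when an i64 shuffle mask is stored as twice as many constant elements on 32-bit targets: constant element i belongs to demanded element i / Scale.

#include <cstdio>

int main() {
  unsigned NumElts = 4, NumCstElts = 8;
  unsigned Scale = NumCstElts / NumElts; // 2
  unsigned DemandedElts = 0b0101;        // demand vector elements 0 and 2
  for (unsigned i = 0; i != NumCstElts; ++i) {
    bool Demanded = (DemandedElts >> (i / Scale)) & 1;
    std::printf("cst[%u] -> elt %u %s\n", i, i / Scale,
                Demanded ? "kept" : "may be replaced with undef");
  }
}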
39335
39336bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
39337 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
39338 TargetLoweringOpt &TLO, unsigned Depth) const {
39339 int NumElts = DemandedElts.getBitWidth();
39340 unsigned Opc = Op.getOpcode();
39341 EVT VT = Op.getValueType();
39342
39343 // Handle special case opcodes.
39344 switch (Opc) {
39345 case X86ISD::PMULDQ:
39346 case X86ISD::PMULUDQ: {
39347 APInt LHSUndef, LHSZero;
39348 APInt RHSUndef, RHSZero;
39349 SDValue LHS = Op.getOperand(0);
39350 SDValue RHS = Op.getOperand(1);
39351 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
39352 Depth + 1))
39353 return true;
39354 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
39355 Depth + 1))
39356 return true;
39357 // Multiply by zero.
39358 KnownZero = LHSZero | RHSZero;
39359 break;
39360 }
39361 case X86ISD::VSHL:
39362 case X86ISD::VSRL:
39363 case X86ISD::VSRA: {
39364 // We only need the bottom 64-bits of the (128-bit) shift amount.
39365 SDValue Amt = Op.getOperand(1);
39366 MVT AmtVT = Amt.getSimpleValueType();
39367 assert(AmtVT.is128BitVector() && "Unexpected value type");
39368
39369 // If the shift amount is only ever reused as an SSE vector shift amount then
39370 // we know that only the bottom 64-bits are ever used.
39371 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
39372 unsigned UseOpc = Use->getOpcode();
39373 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
39374 UseOpc == X86ISD::VSRA) &&
39375 Use->getOperand(0) != Amt;
39376 });
39377
39378 APInt AmtUndef, AmtZero;
39379 unsigned NumAmtElts = AmtVT.getVectorNumElements();
39380 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
39381 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
39382 Depth + 1, AssumeSingleUse))
39383 return true;
39384 LLVM_FALLTHROUGH;
39385 }
39386 case X86ISD::VSHLI:
39387 case X86ISD::VSRLI:
39388 case X86ISD::VSRAI: {
39389 SDValue Src = Op.getOperand(0);
39390 APInt SrcUndef;
39391 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
39392 Depth + 1))
39393 return true;
39394
39395 // Aggressively peek through ops to get at the demanded elts.
39396 if (!DemandedElts.isAllOnesValue())
39397 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
39398 Src, DemandedElts, TLO.DAG, Depth + 1))
39399 return TLO.CombineTo(
39400 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
39401 break;
39402 }
39403 case X86ISD::KSHIFTL: {
39404 SDValue Src = Op.getOperand(0);
39405 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
39406 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
39407 unsigned ShiftAmt = Amt->getZExtValue();
39408
39409 if (ShiftAmt == 0)
39410 return TLO.CombineTo(Op, Src);
39411
39412 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39413 // single shift. We can do this if the bottom bits (which are shifted
39414 // out) are never demanded.
39415 if (Src.getOpcode() == X86ISD::KSHIFTR) {
39416 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
39417 unsigned C1 = Src.getConstantOperandVal(1);
39418 unsigned NewOpc = X86ISD::KSHIFTL;
39419 int Diff = ShiftAmt - C1;
39420 if (Diff < 0) {
39421 Diff = -Diff;
39422 NewOpc = X86ISD::KSHIFTR;
39423 }
39424
39425 SDLoc dl(Op);
39426 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
39427 return TLO.CombineTo(
39428 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
39429 }
39430 }
39431
39432 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
39433 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
39434 Depth + 1))
39435 return true;
39436
39437 KnownUndef <<= ShiftAmt;
39438 KnownZero <<= ShiftAmt;
39439 KnownZero.setLowBits(ShiftAmt);
39440 break;
39441 }
39442 case X86ISD::KSHIFTR: {
39443 SDValue Src = Op.getOperand(0);
39444 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
39445 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
39446 unsigned ShiftAmt = Amt->getZExtValue();
39447
39448 if (ShiftAmt == 0)
39449 return TLO.CombineTo(Op, Src);
39450
39451 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
39452 // single shift. We can do this if the top bits (which are shifted
39453 // out) are never demanded.
39454 if (Src.getOpcode() == X86ISD::KSHIFTL) {
39455 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
39456 unsigned C1 = Src.getConstantOperandVal(1);
39457 unsigned NewOpc = X86ISD::KSHIFTR;
39458 int Diff = ShiftAmt - C1;
39459 if (Diff < 0) {
39460 Diff = -Diff;
39461 NewOpc = X86ISD::KSHIFTL;
39462 }
39463
39464 SDLoc dl(Op);
39465 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
39466 return TLO.CombineTo(
39467 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
39468 }
39469 }
39470
39471 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
39472 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
39473 Depth + 1))
39474 return true;
39475
39476 KnownUndef.lshrInPlace(ShiftAmt);
39477 KnownZero.lshrInPlace(ShiftAmt);
39478 KnownZero.setHighBits(ShiftAmt);
39479 break;
39480 }
39481 case X86ISD::CVTSI2P:
39482 case X86ISD::CVTUI2P: {
39483 SDValue Src = Op.getOperand(0);
39484 MVT SrcVT = Src.getSimpleValueType();
39485 APInt SrcUndef, SrcZero;
39486 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39487 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
39488 Depth + 1))
39489 return true;
39490 break;
39491 }
39492 case X86ISD::PACKSS:
39493 case X86ISD::PACKUS: {
39494 SDValue N0 = Op.getOperand(0);
39495 SDValue N1 = Op.getOperand(1);
39496
39497 APInt DemandedLHS, DemandedRHS;
39498 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
39499
39500 APInt LHSUndef, LHSZero;
39501 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
39502 Depth + 1))
39503 return true;
39504 APInt RHSUndef, RHSZero;
39505 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
39506 Depth + 1))
39507 return true;
39508
39509 // TODO - pass on known zero/undef.
39510
39511 // Aggressively peek through ops to get at the demanded elts.
39512 // TODO - we should do this for all target/faux shuffles ops.
39513 if (!DemandedElts.isAllOnesValue()) {
39514 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
39515 TLO.DAG, Depth + 1);
39516 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
39517 TLO.DAG, Depth + 1);
39518 if (NewN0 || NewN1) {
39519 NewN0 = NewN0 ? NewN0 : N0;
39520 NewN1 = NewN1 ? NewN1 : N1;
39521 return TLO.CombineTo(Op,
39522 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
39523 }
39524 }
39525 break;
39526 }
39527 case X86ISD::HADD:
39528 case X86ISD::HSUB:
39529 case X86ISD::FHADD:
39530 case X86ISD::FHSUB: {
39531 SDValue N0 = Op.getOperand(0);
39532 SDValue N1 = Op.getOperand(1);
39533
39534 APInt DemandedLHS, DemandedRHS;
39535 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
39536
39537 APInt LHSUndef, LHSZero;
39538 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
39539 Depth + 1))
39540 return true;
39541 APInt RHSUndef, RHSZero;
39542 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
39543 Depth + 1))
39544 return true;
39545
39546 // TODO - pass on known zero/undef.
39547
39548 // Aggressively peek through ops to get at the demanded elts.
39549 // TODO: Handle repeated operands.
39550 if (N0 != N1 && !DemandedElts.isAllOnesValue()) {
39551 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
39552 TLO.DAG, Depth + 1);
39553 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
39554 TLO.DAG, Depth + 1);
39555 if (NewN0 || NewN1) {
39556 NewN0 = NewN0 ? NewN0 : N0;
39557 NewN1 = NewN1 ? NewN1 : N1;
39558 return TLO.CombineTo(Op,
39559 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
39560 }
39561 }
39562 break;
39563 }
39564 case X86ISD::VTRUNC:
39565 case X86ISD::VTRUNCS:
39566 case X86ISD::VTRUNCUS: {
39567 SDValue Src = Op.getOperand(0);
39568 MVT SrcVT = Src.getSimpleValueType();
39569 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39570 APInt SrcUndef, SrcZero;
39571 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
39572 Depth + 1))
39573 return true;
39574 KnownZero = SrcZero.zextOrTrunc(NumElts);
39575 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
39576 break;
39577 }
39578 case X86ISD::BLENDV: {
39579 APInt SelUndef, SelZero;
39580 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
39581 SelZero, TLO, Depth + 1))
39582 return true;
39583
39584 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
39585 APInt LHSUndef, LHSZero;
39586 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
39587 LHSZero, TLO, Depth + 1))
39588 return true;
39589
39590 APInt RHSUndef, RHSZero;
39591 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
39592 RHSZero, TLO, Depth + 1))
39593 return true;
39594
39595 KnownZero = LHSZero & RHSZero;
39596 KnownUndef = LHSUndef & RHSUndef;
39597 break;
39598 }
39599 case X86ISD::VZEXT_MOVL: {
39600 // If upper demanded elements are already zero then we have nothing to do.
39601 SDValue Src = Op.getOperand(0);
39602 APInt DemandedUpperElts = DemandedElts;
39603 DemandedUpperElts.clearLowBits(1);
39604 if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
39605 return TLO.CombineTo(Op, Src);
39606 break;
39607 }
39608 case X86ISD::VBROADCAST: {
39609 SDValue Src = Op.getOperand(0);
39610 MVT SrcVT = Src.getSimpleValueType();
39611 if (!SrcVT.isVector())
39612 break;
39613 // Don't bother broadcasting if we just need the 0'th element.
39614 if (DemandedElts == 1) {
39615 if (Src.getValueType() != VT)
39616 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
39617 SDLoc(Op));
39618 return TLO.CombineTo(Op, Src);
39619 }
39620 APInt SrcUndef, SrcZero;
39621 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
39622 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
39623 Depth + 1))
39624 return true;
39625 // Aggressively peek through src to get at the demanded elt.
39626 // TODO - we should do this for all target/faux shuffles ops.
39627 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
39628 Src, SrcElts, TLO.DAG, Depth + 1))
39629 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
39630 break;
39631 }
39632 case X86ISD::VPERMV:
39633 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
39634 Depth))
39635 return true;
39636 break;
39637 case X86ISD::PSHUFB:
39638 case X86ISD::VPERMV3:
39639 case X86ISD::VPERMILPV:
39640 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
39641 Depth))
39642 return true;
39643 break;
39644 case X86ISD::VPPERM:
39645 case X86ISD::VPERMIL2:
39646 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
39647 Depth))
39648 return true;
39649 break;
39650 }
39651
39652 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
39653 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
39654 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
39655 if ((VT.is256BitVector() || VT.is512BitVector()) &&
39656 DemandedElts.lshr(NumElts / 2) == 0) {
39657 unsigned SizeInBits = VT.getSizeInBits();
39658 unsigned ExtSizeInBits = SizeInBits / 2;
39659
39660 // See if 512-bit ops only use the bottom 128-bits.
39661 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
39662 ExtSizeInBits = SizeInBits / 4;
39663
39664 switch (Opc) {
39665 // Scalar broadcast.
39666 case X86ISD::VBROADCAST: {
39667 SDLoc DL(Op);
39668 SDValue Src = Op.getOperand(0);
39669 if (Src.getValueSizeInBits() > ExtSizeInBits)
39670 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
39671 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39672 ExtSizeInBits / VT.getScalarSizeInBits());
39673 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
39674 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39675 TLO.DAG, DL, ExtSizeInBits));
39676 }
39677 case X86ISD::VBROADCAST_LOAD: {
39678 SDLoc DL(Op);
39679 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39680 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39681 ExtSizeInBits / VT.getScalarSizeInBits());
39682 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39683 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39684 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
39685 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
39686 MemIntr->getMemOperand());
39687 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39688 Bcst.getValue(1));
39689 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39690 TLO.DAG, DL, ExtSizeInBits));
39691 }
39692 // Subvector broadcast.
39693 case X86ISD::SUBV_BROADCAST_LOAD: {
39694 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
39695 EVT MemVT = MemIntr->getMemoryVT();
39696 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
39697 SDLoc DL(Op);
39698 SDValue Ld =
39699 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
39700 MemIntr->getBasePtr(), MemIntr->getMemOperand());
39701 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39702 Ld.getValue(1));
39703 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
39704 TLO.DAG, DL, ExtSizeInBits));
39705 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
39706 SDLoc DL(Op);
39707 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
39708 ExtSizeInBits / VT.getScalarSizeInBits());
39709 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
39710 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
39711 SDValue Bcst =
39712 TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
39713 Ops, MemVT, MemIntr->getMemOperand());
39714 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
39715 Bcst.getValue(1));
39716 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
39717 TLO.DAG, DL, ExtSizeInBits));
39718 }
39719 break;
39720 }
39721 // Byte shifts by immediate.
39722 case X86ISD::VSHLDQ:
39723 case X86ISD::VSRLDQ:
39724 // Shift by uniform.
39725 case X86ISD::VSHL:
39726 case X86ISD::VSRL:
39727 case X86ISD::VSRA:
39728 // Shift by immediate.
39729 case X86ISD::VSHLI:
39730 case X86ISD::VSRLI:
39731 case X86ISD::VSRAI: {
39732 SDLoc DL(Op);
39733 SDValue Ext0 =
39734 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
39735 SDValue ExtOp =
39736 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
39737 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39738 SDValue Insert =
39739 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39740 return TLO.CombineTo(Op, Insert);
39741 }
39742 case X86ISD::VPERMI: {
39743 // Simplify PERMPD/PERMQ to extract_subvector.
39744 // TODO: This should be done in shuffle combining.
39745 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
39746 SmallVector<int, 4> Mask;
39747 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
39748 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
39749 SDLoc DL(Op);
39750 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
39751 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39752 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
39753 return TLO.CombineTo(Op, Insert);
39754 }
39755 }
39756 break;
39757 }
39758 case X86ISD::VPERM2X128: {
39759 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
39760 SDLoc DL(Op);
39761 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
39762 if (LoMask & 0x8)
39763 return TLO.CombineTo(
39764 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
39765 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
39766 unsigned SrcIdx = (LoMask & 0x2) >> 1;
39767 SDValue ExtOp =
39768 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
39769 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39770 SDValue Insert =
39771 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39772 return TLO.CombineTo(Op, Insert);
39773 }
39774 // Zero upper elements.
39775 case X86ISD::VZEXT_MOVL:
39776 // Target unary shuffles by immediate:
39777 case X86ISD::PSHUFD:
39778 case X86ISD::PSHUFLW:
39779 case X86ISD::PSHUFHW:
39780 case X86ISD::VPERMILPI:
39781 // (Non-Lane Crossing) Target Shuffles.
39782 case X86ISD::VPERMILPV:
39783 case X86ISD::VPERMIL2:
39784 case X86ISD::PSHUFB:
39785 case X86ISD::UNPCKL:
39786 case X86ISD::UNPCKH:
39787 case X86ISD::BLENDI:
39788 // Integer ops.
39789 case X86ISD::AVG:
39790 case X86ISD::PACKSS:
39791 case X86ISD::PACKUS:
39792 // Horizontal Ops.
39793 case X86ISD::HADD:
39794 case X86ISD::HSUB:
39795 case X86ISD::FHADD:
39796 case X86ISD::FHSUB: {
39797 SDLoc DL(Op);
39798 SmallVector<SDValue, 4> Ops;
39799 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
39800 SDValue SrcOp = Op.getOperand(i);
39801 EVT SrcVT = SrcOp.getValueType();
39802 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
39803 "Unsupported vector size");
39804 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
39805 ExtSizeInBits)
39806 : SrcOp);
39807 }
39808 MVT ExtVT = VT.getSimpleVT();
39809 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
39810 ExtSizeInBits / ExtVT.getScalarSizeInBits());
39811 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
39812 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
39813 SDValue Insert =
39814 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
39815 return TLO.CombineTo(Op, Insert);
39816 }
39817 }
39818 }
39819
39820 // Get target/faux shuffle mask.
39821 APInt OpUndef, OpZero;
39822 SmallVector<int, 64> OpMask;
39823 SmallVector<SDValue, 2> OpInputs;
39824 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
39825 OpZero, TLO.DAG, Depth, false))
39826 return false;
39827
39828 // Shuffle inputs must be the same size as the result.
39829 if (OpMask.size() != (unsigned)NumElts ||
39830 llvm::any_of(OpInputs, [VT](SDValue V) {
39831 return VT.getSizeInBits() != V.getValueSizeInBits() ||
39832 !V.getValueType().isVector();
39833 }))
39834 return false;
39835
39836 KnownZero = OpZero;
39837 KnownUndef = OpUndef;
39838
39839 // Check if shuffle mask can be simplified to undef/zero/identity.
39840 int NumSrcs = OpInputs.size();
39841 for (int i = 0; i != NumElts; ++i)
39842 if (!DemandedElts[i])
39843 OpMask[i] = SM_SentinelUndef;
39844
39845 if (isUndefInRange(OpMask, 0, NumElts)) {
39846 KnownUndef.setAllBits();
39847 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
39848 }
39849 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
39850 KnownZero.setAllBits();
39851 return TLO.CombineTo(
39852 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
39853 }
39854 for (int Src = 0; Src != NumSrcs; ++Src)
39855 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
39856 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
39857
39858 // Attempt to simplify inputs.
39859 for (int Src = 0; Src != NumSrcs; ++Src) {
39860 // TODO: Support inputs of different types.
39861 if (OpInputs[Src].getValueType() != VT)
39862 continue;
39863
39864 int Lo = Src * NumElts;
39865 APInt SrcElts = APInt::getNullValue(NumElts);
39866 for (int i = 0; i != NumElts; ++i)
39867 if (DemandedElts[i]) {
39868 int M = OpMask[i] - Lo;
39869 if (0 <= M && M < NumElts)
39870 SrcElts.setBit(M);
39871 }
39872
39873 // TODO - Propagate input undef/zero elts.
39874 APInt SrcUndef, SrcZero;
39875 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
39876 TLO, Depth + 1))
39877 return true;
39878 }
39879
39880 // If we don't demand all elements, then attempt to combine to a simpler
39881 // shuffle.
39882 // We need to convert the depth to something combineX86ShufflesRecursively
39883 // can handle - so pretend its Depth == 0 again, and reduce the max depth
39884 // to match. This prevents combineX86ShuffleChain from returning a
39885 // combined shuffle that's the same as the original root, causing an
39886 // infinite loop.
39887 if (!DemandedElts.isAllOnesValue()) {
39888 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
39889
39890 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
39891 for (int i = 0; i != NumElts; ++i)
39892 if (DemandedElts[i])
39893 DemandedMask[i] = i;
39894
39895 SDValue NewShuffle = combineX86ShufflesRecursively(
39896 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
39897 /*HasVarMask*/ false,
39898 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
39899 Subtarget);
39900 if (NewShuffle)
39901 return TLO.CombineTo(Op, NewShuffle);
39902 }
39903
39904 return false;
39905}
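
Aside: a standalone sketch of how the demanded-elements bitset is turned into the DemandedMask built just above, modelling SM_SentinelUndef as -1 (an assumption of this sketch, not a statement about the LLVM constant).

#include <cstdio>
#include <vector>

int main() {
  const int Undef = -1; // stand-in for SM_SentinelUndef
  unsigned DemandedElts = 0b0110;
  std::vector<int> DemandedMask(4, Undef);
  for (int i = 0; i != 4; ++i)
    if ((DemandedElts >> i) & 1)
      DemandedMask[i] = i; // demanded lanes keep their identity index
  for (int M : DemandedMask)
    std::printf("%d ", M); // -1 1 2 -1
  std::printf("\n");
}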
39906
39907bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
39908 SDValue Op, const APInt &OriginalDemandedBits,
39909 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
39910 unsigned Depth) const {
39911 EVT VT = Op.getValueType();
39912 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
39913 unsigned Opc = Op.getOpcode();
39914 switch(Opc) {
39915 case X86ISD::VTRUNC: {
39916 KnownBits KnownOp;
39917 SDValue Src = Op.getOperand(0);
39918 MVT SrcVT = Src.getSimpleValueType();
39919
39920 // Simplify the input, using demanded bit information.
39921 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
39922 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
39923 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
39924 return true;
39925 break;
39926 }
39927 case X86ISD::PMULDQ:
39928 case X86ISD::PMULUDQ: {
39929 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
39930 KnownBits KnownOp;
39931 SDValue LHS = Op.getOperand(0);
39932 SDValue RHS = Op.getOperand(1);
39933 // FIXME: Can we bound this better?
39934 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
39935 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
39936 TLO, Depth + 1))
39937 return true;
39938 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
39939 TLO, Depth + 1))
39940 return true;
39941
39942 // Aggressively peek through ops to get at the demanded low bits.
39943 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
39944 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39945 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
39946 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
39947 if (DemandedLHS || DemandedRHS) {
39948 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
39949 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
39950 return TLO.CombineTo(
39951 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
39952 }
39953 break;
39954 }
39955 case X86ISD::VSHLI: {
39956 SDValue Op0 = Op.getOperand(0);
39957
39958 unsigned ShAmt = Op.getConstantOperandVal(1);
39959 if (ShAmt >= BitWidth)
39960 break;
39961
39962 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
39963
39964 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
39965 // single shift. We can do this if the bottom bits (which are shifted
39966 // out) are never demanded.
39967 if (Op0.getOpcode() == X86ISD::VSRLI &&
39968 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
39969 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
39970 if (Shift2Amt < BitWidth) {
39971 int Diff = ShAmt - Shift2Amt;
39972 if (Diff == 0)
39973 return TLO.CombineTo(Op, Op0.getOperand(0));
39974
39975 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
39976 SDValue NewShift = TLO.DAG.getNode(
39977 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
39978 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
39979 return TLO.CombineTo(Op, NewShift);
39980 }
39981 }
39982
39983 // If we are only demanding sign bits then we can use the shift source directly.
39984 unsigned NumSignBits =
39985 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
39986 unsigned UpperDemandedBits =
39987 BitWidth - OriginalDemandedBits.countTrailingZeros();
39988 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
39989 return TLO.CombineTo(Op, Op0);
39990
39991 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
39992 TLO, Depth + 1))
39993 return true;
39994
39995 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
39996 Known.Zero <<= ShAmt;
39997 Known.One <<= ShAmt;
39998
39999 // Low bits known zero.
40000 Known.Zero.setLowBits(ShAmt);
40001 return false;
40002 }
40003 case X86ISD::VSRLI: {
40004 unsigned ShAmt = Op.getConstantOperandVal(1);
40005 if (ShAmt >= BitWidth)
40006 break;
40007
40008 APInt DemandedMask = OriginalDemandedBits << ShAmt;
40009
40010 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
40011 OriginalDemandedElts, Known, TLO, Depth + 1))
40012 return true;
40013
40014 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40015 Known.Zero.lshrInPlace(ShAmt);
40016 Known.One.lshrInPlace(ShAmt);
40017
40018 // High bits known zero.
40019 Known.Zero.setHighBits(ShAmt);
40020 return false;
40021 }
40022 case X86ISD::VSRAI: {
40023 SDValue Op0 = Op.getOperand(0);
40024 SDValue Op1 = Op.getOperand(1);
40025
40026 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
40027 if (ShAmt >= BitWidth)
40028 break;
40029
40030 APInt DemandedMask = OriginalDemandedBits << ShAmt;
40031
40032 // If we just want the sign bit then we don't need to shift it.
40033 if (OriginalDemandedBits.isSignMask())
40034 return TLO.CombineTo(Op, Op0);
40035
40036 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
40037 if (Op0.getOpcode() == X86ISD::VSHLI &&
40038 Op.getOperand(1) == Op0.getOperand(1)) {
40039 SDValue Op00 = Op0.getOperand(0);
40040 unsigned NumSignBits =
40041 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
40042 if (ShAmt < NumSignBits)
40043 return TLO.CombineTo(Op, Op00);
40044 }
40045
40046 // If any of the demanded bits are produced by the sign extension, we also
40047 // demand the input sign bit.
40048 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
40049 DemandedMask.setSignBit();
40050
40051 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
40052 TLO, Depth + 1))
40053 return true;
40054
40055 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
40056 Known.Zero.lshrInPlace(ShAmt);
40057 Known.One.lshrInPlace(ShAmt);
40058
40059 // If the input sign bit is known to be zero, or if none of the top bits
40060 // are demanded, turn this into an unsigned shift right.
40061 if (Known.Zero[BitWidth - ShAmt - 1] ||
40062 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
40063 return TLO.CombineTo(
40064 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
40065
40066 // High bits are known one.
40067 if (Known.One[BitWidth - ShAmt - 1])
40068 Known.One.setHighBits(ShAmt);
40069 return false;
40070 }
40071 case X86ISD::PEXTRB:
40072 case X86ISD::PEXTRW: {
40073 SDValue Vec = Op.getOperand(0);
40074 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
40075 MVT VecVT = Vec.getSimpleValueType();
40076 unsigned NumVecElts = VecVT.getVectorNumElements();
40077
40078 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
40079 unsigned Idx = CIdx->getZExtValue();
40080 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
40081
40082 // If we demand no bits from the vector then we must have demanded
40083 // bits from the implicit zext - simplify to zero.
40084 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
40085 if (DemandedVecBits == 0)
40086 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40087
40088 APInt KnownUndef, KnownZero;
40089 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
40090 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
40091 KnownZero, TLO, Depth + 1))
40092 return true;
40093
40094 KnownBits KnownVec;
40095 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
40096 KnownVec, TLO, Depth + 1))
40097 return true;
40098
40099 if (SDValue V = SimplifyMultipleUseDemandedBits(
40100 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
40101 return TLO.CombineTo(
40102 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
40103
40104 Known = KnownVec.zext(BitWidth);
40105 return false;
40106 }
40107 break;
40108 }
40109 case X86ISD::PINSRB:
40110 case X86ISD::PINSRW: {
40111 SDValue Vec = Op.getOperand(0);
40112 SDValue Scl = Op.getOperand(1);
40113 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
40114 MVT VecVT = Vec.getSimpleValueType();
40115
40116 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
40117 unsigned Idx = CIdx->getZExtValue();
40118 if (!OriginalDemandedElts[Idx])
40119 return TLO.CombineTo(Op, Vec);
40120
40121 KnownBits KnownVec;
40122 APInt DemandedVecElts(OriginalDemandedElts);
40123 DemandedVecElts.clearBit(Idx);
40124 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
40125 KnownVec, TLO, Depth + 1))
40126 return true;
40127
40128 KnownBits KnownScl;
40129 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
40130 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
40131 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
40132 return true;
40133
40134 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
40135 Known = KnownBits::commonBits(KnownVec, KnownScl);
40136 return false;
40137 }
40138 break;
40139 }
40140 case X86ISD::PACKSS:
40141 // PACKSS saturates to MIN/MAX integer values. So if we just want the
40142 // sign bit then we can just ask for the source operands sign bit.
40143 // TODO - add known bits handling.
40144 if (OriginalDemandedBits.isSignMask()) {
40145 APInt DemandedLHS, DemandedRHS;
40146 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
40147
40148 KnownBits KnownLHS, KnownRHS;
40149 APInt SignMask = APInt::getSignMask(BitWidth * 2);
40150 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
40151 KnownLHS, TLO, Depth + 1))
40152 return true;
40153 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
40154 KnownRHS, TLO, Depth + 1))
40155 return true;
40156
40157 // Attempt to avoid multi-use ops if we don't need anything from them.
40158 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
40159 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
40160 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
40161 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
40162 if (DemandedOp0 || DemandedOp1) {
40163 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
40164 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
40165 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
40166 }
40167 }
40168 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
40169 break;
40170 case X86ISD::VBROADCAST: {
40171 SDValue Src = Op.getOperand(0);
40172 MVT SrcVT = Src.getSimpleValueType();
40173 APInt DemandedElts = APInt::getOneBitSet(
40174 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
40175 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
40176 TLO, Depth + 1))
40177 return true;
40178 // If we don't need the upper bits, attempt to narrow the broadcast source.
40179 // Don't attempt this on AVX512 as it might affect broadcast folding.
40180 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
40181 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
40182 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2)) {
40183 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
40184 SDValue NewSrc =
40185 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
40186 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
40187 SDValue NewBcst =
40188 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
40189 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
40190 }
40191 break;
40192 }
40193 case X86ISD::PCMPGT:
40194 // icmp sgt(0, R) == ashr(R, BitWidth-1).
40195 // iff we only need the sign bit then we can use R directly.
40196 if (OriginalDemandedBits.isSignMask() &&
40197 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
40198 return TLO.CombineTo(Op, Op.getOperand(1));
40199 break;
40200 case X86ISD::MOVMSK: {
40201 SDValue Src = Op.getOperand(0);
40202 MVT SrcVT = Src.getSimpleValueType();
40203 unsigned SrcBits = SrcVT.getScalarSizeInBits();
40204 unsigned NumElts = SrcVT.getVectorNumElements();
40205
40206 // If we don't need the sign bits at all just return zero.
40207 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
40208 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40209
40210 // Only demand the vector elements of the sign bits we need.
40211 APInt KnownUndef, KnownZero;
40212 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
40213 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
40214 TLO, Depth + 1))
40215 return true;
40216
40217 Known.Zero = KnownZero.zextOrSelf(BitWidth);
40218 Known.Zero.setHighBits(BitWidth - NumElts);
40219
40220 // MOVMSK only uses the MSB from each vector element.
40221 KnownBits KnownSrc;
40222 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
40223 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
40224 Depth + 1))
40225 return true;
40226
40227 if (KnownSrc.One[SrcBits - 1])
40228 Known.One.setLowBits(NumElts);
40229 else if (KnownSrc.Zero[SrcBits - 1])
40230 Known.Zero.setLowBits(NumElts);
40231
40232 // Attempt to avoid multi-use ops if we don't need anything from them.
40233 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
40234 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
40235 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
40236 return false;
40237 }
40238 case X86ISD::BEXTR:
40239 case X86ISD::BEXTRI: {
40240 SDValue Op0 = Op.getOperand(0);
40241 SDValue Op1 = Op.getOperand(1);
40242
40243 // Only bottom 16-bits of the control bits are required.
40244 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
40245 // NOTE: SimplifyDemandedBits won't do this for constants.
40246 uint64_t Val1 = Cst1->getZExtValue();
40247 uint64_t MaskedVal1 = Val1 & 0xFFFF;
40248 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
40249 SDLoc DL(Op);
40250 return TLO.CombineTo(
40251 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
40252 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
40253 }
40254
40255 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
40256 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
40257
40258 // If the length is 0, the result is 0.
40259 if (Length == 0) {
40260 Known.setAllZero();
40261 return false;
40262 }
40263
40264 if ((Shift + Length) <= BitWidth) {
40265 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
40266 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
40267 return true;
40268
40269 Known = Known.extractBits(Length, Shift);
40270 Known = Known.zextOrTrunc(BitWidth);
40271 return false;
40272 }
40273 } else {
40274 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
40275 KnownBits Known1;
40276 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
40277 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
40278 return true;
40279
40280 // If the length is 0, replace with 0.
40281 KnownBits LengthBits = Known1.extractBits(8, 8);
40282 if (LengthBits.isZero())
40283 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
40284 }
40285
40286 break;
40287 }
40288 case X86ISD::PDEP: {
40289 SDValue Op0 = Op.getOperand(0);
40290 SDValue Op1 = Op.getOperand(1);
40291
40292 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
40293 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
40294
40295 // If the demanded bits have leading zeroes, we don't demand those from the
40296 // mask.
40297 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
40298 return true;
40299
40300 // The number of possible 1s in the mask determines the number of LSBs of
40301 // operand 0 used. Undemanded bits from the mask don't matter so filter
40302 // them before counting.
40303 KnownBits Known2;
40304 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
40305 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
40306 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
40307 return true;
40308
40309 // Zeroes are retained from the mask, but not ones.
40310 Known.One.clearAllBits();
40311 // The result will have at least as many trailing zeros as the non-mask
40312 // operand since bits can only map to the same or higher bit position.
40313 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
40314 return false;
40315 }
40316 }
40317
40318 return TargetLowering::SimplifyDemandedBitsForTargetNode(
40319 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
40320}
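
Aside: a standalone software model of PDEP that matches the reasoning in the X86ISD::PDEP case above: the low popcount(mask) bits of the non-mask operand are deposited, in order, into the set-bit positions of the mask, so zeroes survive from the mask but ones do not. pdep32 is an illustrative helper, not an intrinsic.

#include <cstdint>
#include <cstdio>

static uint32_t pdep32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t Bit = 1; Mask; Bit <<= 1) {
    if (Mask & 1) {
      if (Src & 1)
        Result |= Bit;
      Src >>= 1; // consume one source bit per set mask bit
    }
    Mask >>= 1;
  }
  return Result;
}

int main() {
  // Mask 0b11010 has three set bits, so only the low 3 bits of Src matter.
  std::printf("%#x\n", pdep32(0b101, 0b11010)); // 0b10010 = 0x12
}

Running it prints 0x12, which also illustrates the trailing-zeros argument: result bits below the mask's lowest set bit are always zero.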
40321
40322SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
40323 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
40324 SelectionDAG &DAG, unsigned Depth) const {
40325 int NumElts = DemandedElts.getBitWidth();
40326 unsigned Opc = Op.getOpcode();
40327 EVT VT = Op.getValueType();
40328
40329 switch (Opc) {
40330 case X86ISD::PINSRB:
40331 case X86ISD::PINSRW: {
40332 // If we don't demand the inserted element, return the base vector.
40333 SDValue Vec = Op.getOperand(0);
40334 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
40335 MVT VecVT = Vec.getSimpleValueType();
40336 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
40337 !DemandedElts[CIdx->getZExtValue()])
40338 return Vec;
40339 break;
40340 }
40341 case X86ISD::VSHLI: {
40342 // If we are only demanding sign bits then we can use the shift source
40343 // directly.
40344 SDValue Op0 = Op.getOperand(0);
40345 unsigned ShAmt = Op.getConstantOperandVal(1);
40346 unsigned BitWidth = DemandedBits.getBitWidth();
40347 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
40348 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
40349 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
40350 return Op0;
40351 break;
40352 }
40353 case X86ISD::VSRAI:
40354 // iff we only need the sign bit then we can use the source directly.
40355 // TODO: generalize where we only demand extended signbits.
40356 if (DemandedBits.isSignMask())
40357 return Op.getOperand(0);
40358 break;
40359 case X86ISD::PCMPGT:
40360 // icmp sgt(0, R) == ashr(R, BitWidth-1).
40361 // iff we only need the sign bit then we can use R directly.
40362 if (DemandedBits.isSignMask() &&
40363 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
40364 return Op.getOperand(1);
40365 break;
40366 }
40367
40368 APInt ShuffleUndef, ShuffleZero;
40369 SmallVector<int, 16> ShuffleMask;
40370 SmallVector<SDValue, 2> ShuffleOps;
40371 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
40372 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
40373 // If all the demanded elts are from one operand and are inline,
40374 // then we can use the operand directly.
40375 int NumOps = ShuffleOps.size();
40376 if (ShuffleMask.size() == (unsigned)NumElts &&
40377 llvm::all_of(ShuffleOps, [VT](SDValue V) {
40378 return VT.getSizeInBits() == V.getValueSizeInBits();
40379 })) {
40380
40381 if (DemandedElts.isSubsetOf(ShuffleUndef))
40382 return DAG.getUNDEF(VT);
40383 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
40384 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
40385
40386 // Bitmask that indicates which ops have only been accessed 'inline'.
40387 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
40388 for (int i = 0; i != NumElts; ++i) {
40389 int M = ShuffleMask[i];
40390 if (!DemandedElts[i] || ShuffleUndef[i])
40391 continue;
40392 int OpIdx = M / NumElts;
40393 int EltIdx = M % NumElts;
40394 if (M < 0 || EltIdx != i) {
40395 IdentityOp.clearAllBits();
40396 break;
40397 }
40398 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
40399 if (IdentityOp == 0)
40400 break;
40401 }
40402 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
40403 "Multiple identity shuffles detected");
40404
40405 if (IdentityOp != 0)
40406 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
40407 }
40408 }
40409
40410 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
40411 Op, DemandedBits, DemandedElts, DAG, Depth);
40412}
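
Aside: a standalone sketch of the identity-operand test in the loop above: every demanded, non-undef lane must read its own lane index from one and the same operand, otherwise the shuffle cannot be replaced by a bitcast of that operand. findIdentityOperand is an invented name for illustration.

#include <cstdio>
#include <vector>

// Returns the operand index if every demanded lane i reads lane i of that one
// operand (undef lanes ignored), else -1.
static int findIdentityOperand(const std::vector<int> &Mask, unsigned Demanded) {
  int NumElts = (int)Mask.size();
  int Identity = -2; // not chosen yet
  for (int i = 0; i != NumElts; ++i) {
    if (!((Demanded >> i) & 1) || Mask[i] < 0)
      continue;
    if (Mask[i] % NumElts != i)
      return -1;                       // not an in-place read
    int OpIdx = Mask[i] / NumElts;
    if (Identity == -2)
      Identity = OpIdx;
    else if (Identity != OpIdx)
      return -1;                       // lanes come from different operands
  }
  return Identity == -2 ? -1 : Identity;
}

int main() {
  // Lanes 0, 1 and 3 are demanded; each reads its own lane of operand 1.
  std::printf("%d\n", findIdentityOperand({4, 5, -1, 7}, 0b1011)); // 1
}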
40413
40414// Helper to peek through bitops/trunc/setcc to determine size of source vector.
40415// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
40416static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
40417 bool AllowTruncate) {
40418 switch (Src.getOpcode()) {
40419 case ISD::TRUNCATE:
40420 if (!AllowTruncate)
40421 return false;
40422 LLVM_FALLTHROUGH;
40423 case ISD::SETCC:
40424 return Src.getOperand(0).getValueSizeInBits() == Size;
40425 case ISD::AND:
40426 case ISD::XOR:
40427 case ISD::OR:
40428 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
40429 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
40430 }
40431 return false;
40432}
40433
40434// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
40435static unsigned getAltBitOpcode(unsigned Opcode) {
40436 switch(Opcode) {
40437 case ISD::AND: return X86ISD::FAND;
40438 case ISD::OR: return X86ISD::FOR;
40439 case ISD::XOR: return X86ISD::FXOR;
40440 case X86ISD::ANDNP: return X86ISD::FANDN;
40441 }
40442 llvm_unreachable("Unknown bitwise opcode");
40443}
40444
40445// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
40446static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
40447 const SDLoc &DL) {
40448 EVT SrcVT = Src.getValueType();
40449 if (SrcVT != MVT::v4i1)
40450 return SDValue();
40451
40452 switch (Src.getOpcode()) {
40453 case ISD::SETCC:
40454 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
40455 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
40456 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
40457 SDValue Op0 = Src.getOperand(0);
40458 if (ISD::isNormalLoad(Op0.getNode()))
40459 return DAG.getBitcast(MVT::v4f32, Op0);
40460 if (Op0.getOpcode() == ISD::BITCAST &&
40461 Op0.getOperand(0).getValueType() == MVT::v4f32)
40462 return Op0.getOperand(0);
40463 }
40464 break;
40465 case ISD::AND:
40466 case ISD::XOR:
40467 case ISD::OR: {
40468 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
40469 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
40470 if (Op0 && Op1)
40471 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
40472 Op1);
40473 break;
40474 }
40475 }
40476 return SDValue();
40477}
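As a rough standalone sketch of what this SSE1 path relies on (not taken from the listing; it assumes an x86 host and GCC/Clang-style intrinsics): a "setcc ... setlt 0" lane mask is exactly the sign bits, so MOVMSKPS over the same bits viewed as v4f32 already yields the i4 result.

  #include <xmmintrin.h>  // SSE1
  #include <cstdio>
  #include <cstring>

  int main() {
    int x[4] = {-1, 2, -3, 4};
    __m128 v;
    std::memcpy(&v, x, sizeof(v));        // view the v4i32 bits as v4f32
    int mask = _mm_movemask_ps(v);        // MOVMSKPS gathers the four sign bits
    std::printf("x < 0 mask = 0x%x\n", mask);  // 0x5: lanes 0 and 2 are negative
    return 0;
  }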
40478
40479// Helper to push sign extension of vXi1 SETCC result through bitops.
40480static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
40481 SDValue Src, const SDLoc &DL) {
40482 switch (Src.getOpcode()) {
40483 case ISD::SETCC:
40484 case ISD::TRUNCATE:
40485 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
40486 case ISD::AND:
40487 case ISD::XOR:
40488 case ISD::OR:
40489 return DAG.getNode(
40490 Src.getOpcode(), DL, SExtVT,
40491 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
40492 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
40493 }
40494 llvm_unreachable("Unexpected node type for vXi1 sign extension");
40495}
40496
40497// Try to match patterns such as
40498// (i16 bitcast (v16i1 x))
40499// ->
40500// (i16 movmsk (v16i8 sext (v16i1 x)))
40501// before the illegal vector is scalarized on subtargets that don't have legal
40502// vxi1 types.
40503static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
40504 const SDLoc &DL,
40505 const X86Subtarget &Subtarget) {
40506 EVT SrcVT = Src.getValueType();
40507 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
40508 return SDValue();
40509
40510 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
40511 // legalization destroys the v4i32 type.
40512 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
40513 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
40514 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
40515 DAG.getBitcast(MVT::v4f32, V));
40516 return DAG.getZExtOrTrunc(V, DL, VT);
40517 }
40518 }
40519
40520 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
40521 // movmskb even with avx512. This will be better than truncating to vXi1 and
40522 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
40523 // vpcmpeqb/vpcmpgtb.
40524 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
40525 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
40526 Src.getOperand(0).getValueType() == MVT::v32i8 ||
40527 Src.getOperand(0).getValueType() == MVT::v64i8);
40528
40529 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
40530 // directly with vpmovmskb/vmovmskps/vmovmskpd.
40531 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
40532 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
40533 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
40534 EVT CmpVT = Src.getOperand(0).getValueType();
40535 EVT EltVT = CmpVT.getVectorElementType();
40536 if (CmpVT.getSizeInBits() <= 256 &&
40537 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
40538 PreferMovMsk = true;
40539 }
40540
40541 // With AVX512 vxi1 types are legal and we prefer using k-regs.
40542 // MOVMSK is supported in SSE2 or later.
40543 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
40544 return SDValue();
40545
40546 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
40547 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
40548 // v8i16 and v16i16.
40549 // For these two cases, we can shuffle the upper element bytes to a
40550 // consecutive sequence at the start of the vector and treat the results as
40551 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
40552 // for v16i16 this is not the case, because the shuffle is expensive, so we
40553 // avoid sign-extending to this type entirely.
40554 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
40555 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
40556 MVT SExtVT;
40557 bool PropagateSExt = false;
40558 switch (SrcVT.getSimpleVT().SimpleTy) {
40559 default:
40560 return SDValue();
40561 case MVT::v2i1:
40562 SExtVT = MVT::v2i64;
40563 break;
40564 case MVT::v4i1:
40565 SExtVT = MVT::v4i32;
40566 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
40567 // sign-extend to a 256-bit operation to avoid truncation.
40568 if (Subtarget.hasAVX() &&
40569 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
40570 SExtVT = MVT::v4i64;
40571 PropagateSExt = true;
40572 }
40573 break;
40574 case MVT::v8i1:
40575 SExtVT = MVT::v8i16;
40576 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
40577 // sign-extend to a 256-bit operation to match the compare.
40578 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
40579 // 256-bit because the shuffle is cheaper than sign extending the result of
40580 // the compare.
40581 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
40582 checkBitcastSrcVectorSize(Src, 512, true))) {
40583 SExtVT = MVT::v8i32;
40584 PropagateSExt = true;
40585 }
40586 break;
40587 case MVT::v16i1:
40588 SExtVT = MVT::v16i8;
40589 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
40590 // it is not profitable to sign-extend to 256-bit because this will
40591 // require an extra cross-lane shuffle which is more expensive than
40592 // truncating the result of the compare to 128-bits.
40593 break;
40594 case MVT::v32i1:
40595 SExtVT = MVT::v32i8;
40596 break;
40597 case MVT::v64i1:
40598 // If we have AVX512F but not AVX512BW, and the input is a truncate from
40599 // v64i8 (checked earlier), split the input and make two pmovmskbs.
40600 if (Subtarget.hasAVX512()) {
40601 if (Subtarget.hasBWI())
40602 return SDValue();
40603 SExtVT = MVT::v64i8;
40604 break;
40605 }
40606 // Split if this is a <64 x i8> comparison result.
40607 if (checkBitcastSrcVectorSize(Src, 512, false)) {
40608 SExtVT = MVT::v64i8;
40609 break;
40610 }
40611 return SDValue();
40612 };
40613
40614 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
40615 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
40616
40617 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
40618 V = getPMOVMSKB(DL, V, DAG, Subtarget);
40619 } else {
40620 if (SExtVT == MVT::v8i16)
40621 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
40622 DAG.getUNDEF(MVT::v8i16));
40623 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
40624 }
40625
40626 EVT IntVT =
40627 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
40628 V = DAG.getZExtOrTrunc(V, DL, IntVT);
40629 return DAG.getBitcast(VT, V);
40630}
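A minimal sketch of the pattern this combine targets, assuming an SSE2 host and standard intrinsics (illustrative only, not part of the source): sign-extend the i1 lanes to 0x00/0xFF bytes and let PMOVMSKB collapse them into the scalar.

  #include <emmintrin.h>  // SSE2
  #include <cstdio>

  int main() {
    bool b[16] = {1,0,1,1, 0,0,0,1, 0,0,0,0, 0,0,0,1};   // a v16i1 value
    alignas(16) signed char bytes[16];
    for (int i = 0; i != 16; ++i)
      bytes[i] = b[i] ? -1 : 0;                          // sext i1 -> i8
    __m128i v = _mm_load_si128((const __m128i *)bytes);
    unsigned mask = (unsigned)_mm_movemask_epi8(v);      // PMOVMSKB: one bit per byte
    std::printf("i16 bitcast ~ 0x%04x\n", mask);         // 0x808d for this input
    return 0;
  }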
40631
40632// Convert a vXi1 constant build vector to the same width scalar integer.
40633static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
40634 EVT SrcVT = Op.getValueType();
40635 assert(SrcVT.getVectorElementType() == MVT::i1 &&
40636 "Expected a vXi1 vector");
40637 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
40638 "Expected a constant build vector");
40639
40640 APInt Imm(SrcVT.getVectorNumElements(), 0);
40641 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
40642 SDValue In = Op.getOperand(Idx);
40643 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
40644 Imm.setBit(Idx);
40645 }
40646 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
40647 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
40648}
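The same folding, sketched as plain C++ on a hypothetical v8i1 constant (only bit 0 of each element is meaningful, matching the "& 0x1" above):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const int elts[8] = {1, 0, 1, 1, 0, 0, 0, 1};   // constant v8i1 build vector
    uint64_t imm = 0;
    for (unsigned i = 0; i != 8; ++i)
      if (elts[i] & 1)                              // keep only bit 0 per element
        imm |= uint64_t(1) << i;
    std::printf("scalar i8 = 0x%02x\n", (unsigned)imm);  // 0x8d
    return 0;
  }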
40649
40650static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
40651 TargetLowering::DAGCombinerInfo &DCI,
40652 const X86Subtarget &Subtarget) {
40653 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
40654
40655 if (!DCI.isBeforeLegalizeOps())
40656 return SDValue();
40657
40658 // Only do this if we have k-registers.
40659 if (!Subtarget.hasAVX512())
40660 return SDValue();
40661
40662 EVT DstVT = N->getValueType(0);
40663 SDValue Op = N->getOperand(0);
40664 EVT SrcVT = Op.getValueType();
40665
40666 if (!Op.hasOneUse())
40667 return SDValue();
40668
40669 // Look for logic ops.
40670 if (Op.getOpcode() != ISD::AND &&
40671 Op.getOpcode() != ISD::OR &&
40672 Op.getOpcode() != ISD::XOR)
40673 return SDValue();
40674
40675 // Make sure we have a bitcast between mask registers and a scalar type.
40676 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
40677 DstVT.isScalarInteger()) &&
40678 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
40679 SrcVT.isScalarInteger()))
40680 return SDValue();
40681
40682 SDValue LHS = Op.getOperand(0);
40683 SDValue RHS = Op.getOperand(1);
40684
40685 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
40686 LHS.getOperand(0).getValueType() == DstVT)
40687 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
40688 DAG.getBitcast(DstVT, RHS));
40689
40690 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
40691 RHS.getOperand(0).getValueType() == DstVT)
40692 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40693 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
40694
40695 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
40696 // Most of these have to move a constant from the scalar domain anyway.
40697 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
40698 RHS = combinevXi1ConstantToInteger(RHS, DAG);
40699 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
40700 DAG.getBitcast(DstVT, LHS), RHS);
40701 }
40702
40703 return SDValue();
40704}
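The legality of flipping the bitcast here comes down to bitwise logic commuting with reinterpretation; a small standalone check of that property (illustrative, using std::bitset as a stand-in for a v8i1 mask):

  #include <bitset>
  #include <cassert>
  #include <cstdint>

  int main() {
    std::bitset<8> a("10110010"), b("01010101");   // two v8i1-style masks
    uint8_t ai = (uint8_t)a.to_ulong();            // their i8 "bitcasts"
    uint8_t bi = (uint8_t)b.to_ulong();
    // Lane-wise OR in the mask domain equals OR of the scalar bitcasts, so the
    // logic op can live on whichever side avoids a GPR <-> k-register crossing.
    assert((a | b) == std::bitset<8>((unsigned long)(ai | bi)));
    return 0;
  }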
40705
40706static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
40707 const X86Subtarget &Subtarget) {
40708 SDLoc DL(BV);
40709 unsigned NumElts = BV->getNumOperands();
40710 SDValue Splat = BV->getSplatValue();
40711
40712 // Build MMX element from integer GPR or SSE float values.
40713 auto CreateMMXElement = [&](SDValue V) {
40714 if (V.isUndef())
40715 return DAG.getUNDEF(MVT::x86mmx);
40716 if (V.getValueType().isFloatingPoint()) {
40717 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
40718 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
40719 V = DAG.getBitcast(MVT::v2i64, V);
40720 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
40721 }
40722 V = DAG.getBitcast(MVT::i32, V);
40723 } else {
40724 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
40725 }
40726 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
40727 };
40728
40729 // Convert build vector ops to MMX data in the bottom elements.
40730 SmallVector<SDValue, 8> Ops;
40731
40732 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40733
40734 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
40735 if (Splat) {
40736 if (Splat.isUndef())
40737 return DAG.getUNDEF(MVT::x86mmx);
40738
40739 Splat = CreateMMXElement(Splat);
40740
40741 if (Subtarget.hasSSE1()) {
40742 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
40743 if (NumElts == 8)
40744 Splat = DAG.getNode(
40745 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40746 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
40747 TLI.getPointerTy(DAG.getDataLayout())),
40748 Splat, Splat);
40749
40750 // Use PSHUFW to repeat 16-bit elements.
40751 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
40752 return DAG.getNode(
40753 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
40754 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
40755 TLI.getPointerTy(DAG.getDataLayout())),
40756 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
40757 }
40758 Ops.append(NumElts, Splat);
40759 } else {
40760 for (unsigned i = 0; i != NumElts; ++i)
40761 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
40762 }
40763
40764 // Use tree of PUNPCKLs to build up general MMX vector.
40765 while (Ops.size() > 1) {
40766 unsigned NumOps = Ops.size();
40767 unsigned IntrinOp =
40768 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
40769 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
40770 : Intrinsic::x86_mmx_punpcklbw));
40771 SDValue Intrin = DAG.getTargetConstant(
40772 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
40773 for (unsigned i = 0; i != NumOps; i += 2)
40774 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
40775 Ops[i], Ops[i + 1]);
40776 Ops.resize(NumOps / 2);
40777 }
40778
40779 return Ops[0];
40780}
40781
40782// Recursive function that attempts to find if a bool vector node was originally
40783// a vector/float/double that got truncated/extended/bitcast to/from a scalar
40784// integer. If so, replace the scalar ops with bool vector equivalents back down
40785// the chain.
40786static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
40787 SelectionDAG &DAG,
40788 const X86Subtarget &Subtarget) {
40789 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40790 unsigned Opc = V.getOpcode();
40791 switch (Opc) {
40792 case ISD::BITCAST: {
40793 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
40794 SDValue Src = V.getOperand(0);
40795 EVT SrcVT = Src.getValueType();
40796 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
40797 return DAG.getBitcast(VT, Src);
40798 break;
40799 }
40800 case ISD::TRUNCATE: {
40801 // If we find a suitable source, a truncated scalar becomes a subvector.
40802 SDValue Src = V.getOperand(0);
40803 EVT NewSrcVT =
40804 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
40805 if (TLI.isTypeLegal(NewSrcVT))
40806 if (SDValue N0 =
40807 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40808 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
40809 DAG.getIntPtrConstant(0, DL));
40810 break;
40811 }
40812 case ISD::ANY_EXTEND:
40813 case ISD::ZERO_EXTEND: {
40814 // If we find a suitable source, an extended scalar becomes a subvector.
40815 SDValue Src = V.getOperand(0);
40816 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
40817 Src.getScalarValueSizeInBits());
40818 if (TLI.isTypeLegal(NewSrcVT))
40819 if (SDValue N0 =
40820 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
40821 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40822 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
40823 : DAG.getConstant(0, DL, VT),
40824 N0, DAG.getIntPtrConstant(0, DL));
40825 break;
40826 }
40827 case ISD::OR: {
40828 // If we find suitable sources, we can just move an OR to the vector domain.
40829 SDValue Src0 = V.getOperand(0);
40830 SDValue Src1 = V.getOperand(1);
40831 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40832 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
40833 return DAG.getNode(Opc, DL, VT, N0, N1);
40834 break;
40835 }
40836 case ISD::SHL: {
40837 // If we find a suitable source, a SHL becomes a KSHIFTL.
40838 SDValue Src0 = V.getOperand(0);
40839 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
40840 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
40841 break;
40842
40843 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
40844 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
40845 return DAG.getNode(
40846 X86ISD::KSHIFTL, DL, VT, N0,
40847 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
40848 break;
40849 }
40850 }
40851 return SDValue();
40852}
40853
40854static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
40855 TargetLowering::DAGCombinerInfo &DCI,
40856 const X86Subtarget &Subtarget) {
40857 SDValue N0 = N->getOperand(0);
40858 EVT VT = N->getValueType(0);
40859 EVT SrcVT = N0.getValueType();
40860 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40861
40862 // Try to match patterns such as
40863 // (i16 bitcast (v16i1 x))
40864 // ->
40865 // (i16 movmsk (v16i8 sext (v16i1 x)))
40866 // before the setcc result is scalarized on subtargets that don't have legal
40867 // vxi1 types.
40868 if (DCI.isBeforeLegalize()) {
40869 SDLoc dl(N);
40870 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
40871 return V;
40872
40873 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40874 // type, widen both sides to avoid a trip through memory.
40875 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
40876 Subtarget.hasAVX512()) {
40877 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
40878 N0 = DAG.getBitcast(MVT::v8i1, N0);
40879 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
40880 DAG.getIntPtrConstant(0, dl));
40881 }
40882
40883 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
40884 // type, widen both sides to avoid a trip through memory.
40885 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
40886 Subtarget.hasAVX512()) {
40887 // Use zeros for the widening if we already have some zeroes. This can
40888 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
40889 // stream of this.
40890 // FIXME: It might make sense to detect a concat_vectors with a mix of
40891 // zeroes and undef and turn it into insert_subvector for i1 vectors as
40892 // a separate combine. What we can't do is canonicalize the operands of
40893 // such a concat or we'll get into a loop with SimplifyDemandedBits.
40894 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
40895 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
40896 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
40897 SrcVT = LastOp.getValueType();
40898 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40899 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
40900 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
40901 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40902 N0 = DAG.getBitcast(MVT::i8, N0);
40903 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40904 }
40905 }
40906
40907 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
40908 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
40909 Ops[0] = N0;
40910 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
40911 N0 = DAG.getBitcast(MVT::i8, N0);
40912 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
40913 }
40914 } else {
40915 // If we're bitcasting from iX to vXi1, see if the integer originally
40916 // began as a vXi1 and whether we can remove the bitcast entirely.
40917 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
40918 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
40919 if (SDValue V =
40920 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
40921 return V;
40922 }
40923 }
40924
40925 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
40926 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
40927 // due to insert_subvector legalization on KNL. By promoting the copy to i16
40928 // we can help with known bits propagation from the vXi1 domain to the
40929 // scalar domain.
40930 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
40931 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40932 N0.getOperand(0).getValueType() == MVT::v16i1 &&
40933 isNullConstant(N0.getOperand(1)))
40934 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
40935 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
40936
40937 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
40938 // and the vbroadcast_load are both integer or both fp. In some cases this
40939 // will remove the bitcast entirely.
40940 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
40941 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
40942 auto *BCast = cast<MemIntrinsicSDNode>(N0);
40943 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
40944 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
40945 // Don't swap i8/i16 since we don't have fp types of that size.
40946 if (MemSize >= 32) {
40947 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
40948 : MVT::getIntegerVT(MemSize);
40949 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
40950 : MVT::getIntegerVT(SrcVTSize);
40951 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
40952
40953 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
40954 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
40955 SDValue ResNode =
40956 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
40957 MemVT, BCast->getMemOperand());
40958 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
40959 return DAG.getBitcast(VT, ResNode);
40960 }
40961 }
40962
40963 // Since MMX types are special and don't usually play with other vector types,
40964 // it's better to handle them early to be sure we emit efficient code by
40965 // avoiding store-load conversions.
40966 if (VT == MVT::x86mmx) {
40967 // Detect MMX constant vectors.
40968 APInt UndefElts;
40969 SmallVector<APInt, 1> EltBits;
40970 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
40971 SDLoc DL(N0);
40972 // Handle zero-extension of i32 with MOVD.
40973 if (EltBits[0].countLeadingZeros() >= 32)
40974 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
40975 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
40976 // Else, bitcast to a double.
40977 // TODO - investigate supporting sext 32-bit immediates on x86_64.
40978 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
40979 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
40980 }
40981
40982 // Detect bitcasts to x86mmx low word.
40983 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
40984 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
40985 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
40986 bool LowUndef = true, AllUndefOrZero = true;
40987 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
40988 SDValue Op = N0.getOperand(i);
40989 LowUndef &= Op.isUndef() || (i >= e/2);
40990 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
40991 }
40992 if (AllUndefOrZero) {
40993 SDValue N00 = N0.getOperand(0);
40994 SDLoc dl(N00);
40995 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
40996 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
40997 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
40998 }
40999 }
41000
41001 // Detect bitcasts of 64-bit build vectors and convert to a
41002 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
41003 // lowest element.
41004 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
41005 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
41006 SrcVT == MVT::v8i8))
41007 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
41008
41009 // Detect bitcasts between element or subvector extraction to x86mmx.
41010 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
41011 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
41012 isNullConstant(N0.getOperand(1))) {
41013 SDValue N00 = N0.getOperand(0);
41014 if (N00.getValueType().is128BitVector())
41015 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
41016 DAG.getBitcast(MVT::v2i64, N00));
41017 }
41018
41019 // Detect bitcasts from FP_TO_SINT to x86mmx.
41020 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
41021 SDLoc DL(N0);
41022 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
41023 DAG.getUNDEF(MVT::v2i32));
41024 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
41025 DAG.getBitcast(MVT::v2i64, Res));
41026 }
41027 }
41028
41029 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
41030 // most of these to scalar anyway.
41031 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
41032 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
41033 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
41034 return combinevXi1ConstantToInteger(N0, DAG);
41035 }
41036
41037 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
41038 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
41039 isa<ConstantSDNode>(N0)) {
41040 auto *C = cast<ConstantSDNode>(N0);
41041 if (C->isAllOnesValue())
41042 return DAG.getConstant(1, SDLoc(N0), VT);
41043 if (C->isNullValue())
41044 return DAG.getConstant(0, SDLoc(N0), VT);
41045 }
41046
41047 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
41048 // Turn it into a sign bit compare that produces a k-register. This avoids
41049 // a trip through a GPR.
41050 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
41051 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
41052 isPowerOf2_32(VT.getVectorNumElements())) {
41053 unsigned NumElts = VT.getVectorNumElements();
41054 SDValue Src = N0;
41055
41056 // Peek through truncate.
41057 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
41058 Src = N0.getOperand(0);
41059
41060 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
41061 SDValue MovmskIn = Src.getOperand(0);
41062 MVT MovmskVT = MovmskIn.getSimpleValueType();
41063 unsigned MovMskElts = MovmskVT.getVectorNumElements();
41064
41065 // We allow extra bits of the movmsk to be used since they are known zero.
41066 // We can't convert a VPMOVMSKB without avx512bw.
41067 if (MovMskElts <= NumElts &&
41068 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
41069 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
41070 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
41071 SDLoc dl(N);
41072 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
41073 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
41074 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
41075 if (EVT(CmpVT) == VT)
41076 return Cmp;
41077
41078 // Pad with zeroes up to original VT to replace the zeroes that were
41079 // being used from the MOVMSK.
41080 unsigned NumConcats = NumElts / MovMskElts;
41081 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
41082 Ops[0] = Cmp;
41083 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
41084 }
41085 }
41086 }
41087
41088 // Try to remove bitcasts from input and output of mask arithmetic to
41089 // remove GPR<->K-register crossings.
41090 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
41091 return V;
41092
41093 // Convert a bitcasted integer logic operation that has one bitcasted
41094 // floating-point operand into a floating-point logic operation. This may
41095 // create a load of a constant, but that is cheaper than materializing the
41096 // constant in an integer register and transferring it to an SSE register or
41097 // transferring the SSE operand to integer register and back.
41098 unsigned FPOpcode;
41099 switch (N0.getOpcode()) {
41100 case ISD::AND: FPOpcode = X86ISD::FAND; break;
41101 case ISD::OR: FPOpcode = X86ISD::FOR; break;
41102 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
41103 default: return SDValue();
41104 }
41105
41106 // Check if we have a bitcast from another integer type as well.
41107 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
41108 (Subtarget.hasSSE2() && VT == MVT::f64) ||
41109 (Subtarget.hasFP16() && VT == MVT::f16) ||
41110 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
41111 TLI.isTypeLegal(VT))))
41112 return SDValue();
41113
41114 SDValue LogicOp0 = N0.getOperand(0);
41115 SDValue LogicOp1 = N0.getOperand(1);
41116 SDLoc DL0(N0);
41117
41118 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
41119 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
41120 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
41121 LogicOp0.getOperand(0).getValueType() == VT &&
41122 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
41123 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
41124 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
41125 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
41126 }
41127 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
41128 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
41129 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
41130 LogicOp1.getOperand(0).getValueType() == VT &&
41131 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
41132 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
41133 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
41134 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
41135 }
41136
41137 return SDValue();
41138}
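A hedged illustration of why the integer-to-FP logic rewrite at the end is sound (not from the source; plain C++ with memcpy standing in for the bitcast): the logic op only touches bits, so it gives the same result in either domain.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static uint32_t asBits(float f) { uint32_t u; std::memcpy(&u, &f, 4); return u; }
  static float asFloat(uint32_t u) { float f; std::memcpy(&f, &u, 4); return f; }

  int main() {
    float x = -1.5f;
    uint32_t signMask = 0x7fffffffu;
    // bitcast(and(bitcast(x), mask)): the integer AND on float bits is the same
    // operation an ANDPS-style FP logic op would perform (here it computes fabs).
    float r = asFloat(asBits(x) & signMask);
    assert(r == 1.5f);
    return 0;
  }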
41139
41140// Given a ABS node, detect the following pattern:
41141// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
41142// This is useful as it is the input into a SAD pattern.
41143static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
41144 SDValue AbsOp1 = Abs->getOperand(0);
41145 if (AbsOp1.getOpcode() != ISD::SUB)
41146 return false;
41147
41148 Op0 = AbsOp1.getOperand(0);
41149 Op1 = AbsOp1.getOperand(1);
41150
41151 // Check if the operands of the sub are zero-extended from vectors of i8.
41152 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
41153 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
41154 Op1.getOpcode() != ISD::ZERO_EXTEND ||
41155 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
41156 return false;
41157
41158 return true;
41159}
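Per element, the matched DAG shape computes |zext(a) - zext(b)| on i8 inputs; a scalar sketch of that arithmetic (illustrative only):

  #include <cstdint>
  #include <cstdlib>
  #include <cstdio>

  int main() {
    uint8_t a = 10, b = 250;
    int diff = std::abs((int)a - (int)b);   // ABS(SUB(ZERO_EXTEND a, ZERO_EXTEND b))
    std::printf("absdiff = %d\n", diff);    // 240, one lane of a SAD
    return 0;
  }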
41160
41161// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
41162// to these zexts.
41163static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
41164 const SDValue &Zext1, const SDLoc &DL,
41165 const X86Subtarget &Subtarget) {
41166 // Find the appropriate width for the PSADBW.
41167 EVT InVT = Zext0.getOperand(0).getValueType();
41168 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
41169
41170 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
41171 // fill in the missing vector elements with 0.
41172 unsigned NumConcat = RegSize / InVT.getSizeInBits();
41173 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
41174 Ops[0] = Zext0.getOperand(0);
41175 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
41176 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
41177 Ops[0] = Zext1.getOperand(0);
41178 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
41179
41180 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
41181 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41182 ArrayRef<SDValue> Ops) {
41183 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
41184 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
41185 };
41186 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
41187 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
41188 PSADBWBuilder);
41189}
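What the emitted node computes, shown with the corresponding SSE2 intrinsic (a standalone sketch assuming an x86 host): PSADBW produces one i64 sum of eight byte absolute differences per 64-bit lane.

  #include <emmintrin.h>  // SSE2
  #include <cstdint>
  #include <cstdio>

  int main() {
    alignas(16) uint8_t a[16], b[16];
    for (int i = 0; i != 16; ++i) { a[i] = (uint8_t)i; b[i] = (uint8_t)(2 * i); }
    __m128i sad = _mm_sad_epu8(_mm_load_si128((const __m128i *)a),
                               _mm_load_si128((const __m128i *)b));
    alignas(16) uint64_t out[2];
    _mm_store_si128((__m128i *)out, sad);
    // |i - 2i| = i, so lane 0 sums 0..7 and lane 1 sums 8..15.
    std::printf("lo=%llu hi=%llu\n", (unsigned long long)out[0],
                (unsigned long long)out[1]);  // lo=28 hi=92
    return 0;
  }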
41190
41191// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
41192// PHMINPOSUW.
41193static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
41194 const X86Subtarget &Subtarget) {
41195 // Bail without SSE41.
41196 if (!Subtarget.hasSSE41())
41197 return SDValue();
41198
41199 EVT ExtractVT = Extract->getValueType(0);
41200 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
41201 return SDValue();
41202
41203 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
41204 ISD::NodeType BinOp;
41205 SDValue Src = DAG.matchBinOpReduction(
41206 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
41207 if (!Src)
41208 return SDValue();
41209
41210 EVT SrcVT = Src.getValueType();
41211 EVT SrcSVT = SrcVT.getScalarType();
41212 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
41213 return SDValue();
41214
41215 SDLoc DL(Extract);
41216 SDValue MinPos = Src;
41217
41218 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
41219 while (SrcVT.getSizeInBits() > 128) {
41220 SDValue Lo, Hi;
41221 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
41222 SrcVT = Lo.getValueType();
41223 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
41224 }
41225 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
41226 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
41227 "Unexpected value type");
41228
41229 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
41230 // to flip the value accordingly.
41231 SDValue Mask;
41232 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
41233 if (BinOp == ISD::SMAX)
41234 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
41235 else if (BinOp == ISD::SMIN)
41236 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
41237 else if (BinOp == ISD::UMAX)
41238 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
41239
41240 if (Mask)
41241 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
41242
41243 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
41244 // shuffling each upper element down and insert zeros. This means that the
41245 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
41246 // ready for the PHMINPOS.
41247 if (ExtractVT == MVT::i8) {
41248 SDValue Upper = DAG.getVectorShuffle(
41249 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
41250 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
41251 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
41252 }
41253
41254 // Perform the PHMINPOS on a v8i16 vector,
41255 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
41256 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
41257 MinPos = DAG.getBitcast(SrcVT, MinPos);
41258
41259 if (Mask)
41260 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
41261
41262 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
41263 DAG.getIntPtrConstant(0, DL));
41264}
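For reference, the UMIN(v8i16) case this targets maps directly to the SSE4.1 PHMINPOSUW intrinsic; the other min/max kinds are handled by XOR-ing with the bias masks built above. A hedged standalone sketch:

  #include <smmintrin.h>  // SSE4.1
  #include <cstdint>
  #include <cstdio>

  int main() {
    alignas(16) uint16_t v[8] = {900, 17, 65535, 42, 300, 5, 5000, 77};
    __m128i x = _mm_load_si128((const __m128i *)v);
    __m128i r = _mm_minpos_epu16(x);                       // PHMINPOSUW
    unsigned minVal = (unsigned)_mm_extract_epi16(r, 0);   // element 0: the minimum
    unsigned minIdx = (unsigned)_mm_extract_epi16(r, 1);   // element 1: its index
    std::printf("umin = %u at index %u\n", minVal, minIdx);  // 5 at index 5
    return 0;
  }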
41265
41266// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
41267static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
41268 const X86Subtarget &Subtarget) {
41269 // Bail without SSE2.
41270 if (!Subtarget.hasSSE2())
41271 return SDValue();
41272
41273 EVT ExtractVT = Extract->getValueType(0);
41274 unsigned BitWidth = ExtractVT.getSizeInBits();
41275 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
41276 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
41277 return SDValue();
41278
41279 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
41280 ISD::NodeType BinOp;
41281 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
41282 if (!Match && ExtractVT == MVT::i1)
41283 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
41284 if (!Match)
41285 return SDValue();
41286
41287 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
41288 // which we can't support here for now.
41289 if (Match.getScalarValueSizeInBits() != BitWidth)
41290 return SDValue();
41291
41292 SDValue Movmsk;
41293 SDLoc DL(Extract);
41294 EVT MatchVT = Match.getValueType();
41295 unsigned NumElts = MatchVT.getVectorNumElements();
41296 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
41297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41298
41299 if (ExtractVT == MVT::i1) {
41300 // Special case for (pre-legalization) vXi1 reductions.
41301 if (NumElts > 64 || !isPowerOf2_32(NumElts))
41302 return SDValue();
41303 if (TLI.isTypeLegal(MatchVT)) {
41304 // If this is a legal AVX512 predicate type then we can just bitcast.
41305 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41306 Movmsk = DAG.getBitcast(MovmskVT, Match);
41307 } else {
41308 // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
41309 // PCMPEQQ (SSE41+), use PCMPEQD instead.
41310 if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
41311 Match.getOpcode() == ISD::SETCC &&
41312 ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
41313 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
41314 ISD::CondCode::SETEQ) {
41315 SDValue Vec = Match.getOperand(0);
41316 if (Vec.getValueType().getScalarType() == MVT::i64 &&
41317 (2 * NumElts) <= MaxElts) {
41318 NumElts *= 2;
41319 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
41320 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
41321 Match = DAG.getSetCC(
41322 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
41323 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
41324 }
41325 }
41326
41327 // Use combineBitcastvxi1 to create the MOVMSK.
41328 while (NumElts > MaxElts) {
41329 SDValue Lo, Hi;
41330 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
41331 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
41332 NumElts /= 2;
41333 }
41334 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41335 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
41336 }
41337 if (!Movmsk)
41338 return SDValue();
41339 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
41340 } else {
41341 // FIXME: Better handling of k-registers or 512-bit vectors?
41342 unsigned MatchSizeInBits = Match.getValueSizeInBits();
41343 if (!(MatchSizeInBits == 128 ||
41344 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
41345 return SDValue();
41346
41347 // Make sure this isn't a vector of 1 element. The perf win from using
41348 // MOVMSK diminishes with fewer elements in the reduction, but it is
41349 // generally better to get the comparison over to the GPRs as soon as
41350 // possible to reduce the number of vector ops.
41351 if (Match.getValueType().getVectorNumElements() < 2)
41352 return SDValue();
41353
41354 // Check that we are extracting a reduction of all sign bits.
41355 if (DAG.ComputeNumSignBits(Match) != BitWidth)
41356 return SDValue();
41357
41358 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
41359 SDValue Lo, Hi;
41360 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
41361 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
41362 MatchSizeInBits = Match.getValueSizeInBits();
41363 }
41364
41365 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
41366 MVT MaskSrcVT;
41367 if (64 == BitWidth || 32 == BitWidth)
41368 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
41369 MatchSizeInBits / BitWidth);
41370 else
41371 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
41372
41373 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
41374 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
41375 NumElts = MaskSrcVT.getVectorNumElements();
41376 }
41377 assert((NumElts <= 32 || NumElts == 64) &&
41378 "Not expecting more than 64 elements");
41379
41380 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
41381 if (BinOp == ISD::XOR) {
41382 // parity -> (PARITY(MOVMSK X))
41383 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
41384 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
41385 }
41386
41387 SDValue CmpC;
41388 ISD::CondCode CondCode;
41389 if (BinOp == ISD::OR) {
41390 // any_of -> MOVMSK != 0
41391 CmpC = DAG.getConstant(0, DL, CmpVT);
41392 CondCode = ISD::CondCode::SETNE;
41393 } else {
41394 // all_of -> MOVMSK == ((1 << NumElts) - 1)
41395 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
41396 DL, CmpVT);
41397 CondCode = ISD::CondCode::SETEQ;
41398 }
41399
41400 // The setcc produces an i8 of 0/1, so extend that to the result width and
41401 // negate to get the final 0/-1 mask value.
41402 EVT SetccVT =
41403 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
41404 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
41405 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
41406 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
41407 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
41408}
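A compact sketch of the three reductions once the MOVMSK has been formed (illustrative only; __builtin_parity is assumed to be available as the GCC/Clang builtin):

  #include <emmintrin.h>  // SSE2
  #include <cstdio>

  int main() {
    alignas(16) int a[4] = {1, 2, 3, 4};
    alignas(16) int b[4] = {1, 2, 3, 5};
    __m128i eq = _mm_cmpeq_epi32(_mm_load_si128((const __m128i *)a),
                                 _mm_load_si128((const __m128i *)b));  // 0/-1 lanes
    int mask = _mm_movemask_ps(_mm_castsi128_ps(eq));  // one bit per i32 lane
    int anyOf  = (mask != 0);                          // OR reduction
    int allOf  = (mask == 0xF);                        // AND reduction
    int parity = __builtin_parity((unsigned)mask);     // XOR reduction
    std::printf("any=%d all=%d parity=%d\n", anyOf, allOf, parity);  // 1 0 1
    return 0;
  }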
41409
41410static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
41411 const X86Subtarget &Subtarget) {
41412 // PSADBW is only supported on SSE2 and up.
41413 if (!Subtarget.hasSSE2())
41414 return SDValue();
41415
41416 EVT ExtractVT = Extract->getValueType(0);
41417 // Verify the type we're extracting is either i32 or i64.
41418 // FIXME: Could support other types, but this is what we have coverage for.
41419 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
41420 return SDValue();
41421
41422 EVT VT = Extract->getOperand(0).getValueType();
41423 if (!isPowerOf2_32(VT.getVectorNumElements()))
41424 return SDValue();
41425
41426 // Match shuffle + add pyramid.
41427 ISD::NodeType BinOp;
41428 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
41429
41430 // The operand is expected to be zero extended from i8
41431 // (verified in detectZextAbsDiff).
41432 // In order to convert to i64 and above, additional any/zero/sign
41433 // extend is expected.
41434 // The zero extend from 32 bit has no mathematical effect on the result.
41435 // Also the sign extend is basically zero extend
41436 // (extends the sign bit which is zero).
41437 // So it is correct to skip the sign/zero extend instruction.
41438 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
41439 Root.getOpcode() == ISD::ZERO_EXTEND ||
41440 Root.getOpcode() == ISD::ANY_EXTEND))
41441 Root = Root.getOperand(0);
41442
41443 // If there was a match, we want Root to be an ABS node that is the root of
41444 // an abs-diff pattern.
41445 if (!Root || Root.getOpcode() != ISD::ABS)
41446 return SDValue();
41447
41448 // Check whether we have an abs-diff pattern feeding into the select.
41449 SDValue Zext0, Zext1;
41450 if (!detectZextAbsDiff(Root, Zext0, Zext1))
41451 return SDValue();
41452
41453 // Create the SAD instruction.
41454 SDLoc DL(Extract);
41455 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
41456
41457 // If the original vector was wider than 8 elements, sum over the results
41458 // in the SAD vector.
41459 unsigned Stages = Log2_32(VT.getVectorNumElements());
41460 EVT SadVT = SAD.getValueType();
41461 if (Stages > 3) {
41462 unsigned SadElems = SadVT.getVectorNumElements();
41463
41464 for(unsigned i = Stages - 3; i > 0; --i) {
41465 SmallVector<int, 16> Mask(SadElems, -1);
41466 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
41467 Mask[j] = MaskEnd + j;
41468
41469 SDValue Shuffle =
41470 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
41471 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
41472 }
41473 }
41474
41475 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
41476 // Return the lowest ExtractSizeInBits bits.
41477 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
41478 SadVT.getSizeInBits() / ExtractSizeInBits);
41479 SAD = DAG.getBitcast(ResVT, SAD);
41480 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
41481 Extract->getOperand(1));
41482}
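The tail of the combine, where a wider SAD result is folded down by adding shuffled halves and extracting element 0, looks roughly like this in intrinsics (a sketch, not the lowering itself):

  #include <emmintrin.h>  // SSE2
  #include <cstdint>
  #include <cstdio>

  int main() {
    alignas(16) uint8_t a[16], b[16];
    for (int i = 0; i != 16; ++i) { a[i] = (uint8_t)(3 * i); b[i] = (uint8_t)i; }
    __m128i sad = _mm_sad_epu8(_mm_load_si128((const __m128i *)a),
                               _mm_load_si128((const __m128i *)b));
    __m128i hi  = _mm_srli_si128(sad, 8);                // shuffle the upper half down
    __m128i sum = _mm_add_epi64(sad, hi);                // add it into the lower half
    unsigned total = (unsigned)_mm_cvtsi128_si32(sum);   // extract element 0
    std::printf("sum |a[i]-b[i]| = %u\n", total);        // 2*(0+..+15) = 240
    return 0;
  }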
41483
41484// Attempt to peek through a target shuffle and extract the scalar from the
41485// source.
41486static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
41487 TargetLowering::DAGCombinerInfo &DCI,
41488 const X86Subtarget &Subtarget) {
41489 if (DCI.isBeforeLegalizeOps())
41490 return SDValue();
41491
41492 SDLoc dl(N);
41493 SDValue Src = N->getOperand(0);
41494 SDValue Idx = N->getOperand(1);
41495
41496 EVT VT = N->getValueType(0);
41497 EVT SrcVT = Src.getValueType();
41498 EVT SrcSVT = SrcVT.getVectorElementType();
41499 unsigned SrcEltBits = SrcSVT.getSizeInBits();
41500 unsigned NumSrcElts = SrcVT.getVectorNumElements();
41501
41502 // Don't attempt this for boolean mask vectors or unknown extraction indices.
41503 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
41504 return SDValue();
41505
41506 const APInt &IdxC = N->getConstantOperandAPInt(1);
41507 if (IdxC.uge(NumSrcElts))
41508 return SDValue();
41509
41510 SDValue SrcBC = peekThroughBitcasts(Src);
41511
41512 // Handle extract(bitcast(broadcast(scalar_value))).
41513 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
41514 SDValue SrcOp = SrcBC.getOperand(0);
41515 EVT SrcOpVT = SrcOp.getValueType();
41516 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
41517 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
41518 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
41519 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
41520 // TODO support non-zero offsets.
41521 if (Offset == 0) {
41522 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
41523 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
41524 return SrcOp;
41525 }
41526 }
41527 }
41528
41529 // If we're extracting a single element from a broadcast load and there are
41530 // no other users, just create a single load.
41531 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
41532 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
41533 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
41534 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
41535 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
41536 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
41537 MemIntr->getBasePtr(),
41538 MemIntr->getPointerInfo(),
41539 MemIntr->getOriginalAlign(),
41540 MemIntr->getMemOperand()->getFlags());
41541 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41542 return Load;
41543 }
41544 }
41545
41546 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
41547 // TODO: Move to DAGCombine?
41548 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
41549 SrcBC.getValueType().isInteger() &&
41550 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
41551 SrcBC.getScalarValueSizeInBits() ==
41552 SrcBC.getOperand(0).getValueSizeInBits()) {
41553 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
41554 if (IdxC.ult(Scale)) {
41555 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
41556 SDValue Scl = SrcBC.getOperand(0);
41557 EVT SclVT = Scl.getValueType();
41558 if (Offset) {
41559 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
41560 DAG.getShiftAmountConstant(Offset, SclVT, dl));
41561 }
41562 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
41563 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
41564 return Scl;
41565 }
41566 }
41567
41568 // Handle extract(truncate(x)) for 0'th index.
41569 // TODO: Treat this as a faux shuffle?
41570 // TODO: When can we use this for general indices?
41571 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
41572 (SrcVT.getSizeInBits() % 128) == 0) {
41573 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
41574 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
41575 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
41576 Idx);
41577 }
41578
41579 // We can only legally extract other elements from 128-bit vectors and in
41580 // certain circumstances, depending on SSE-level.
41581 // TODO: Investigate float/double extraction if it will be just stored.
41582 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
41583 unsigned Idx) {
41584 EVT VecSVT = VecVT.getScalarType();
41585 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
41586 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
41587 VecSVT == MVT::i64)) {
41588 unsigned EltSizeInBits = VecSVT.getSizeInBits();
41589 unsigned NumEltsPerLane = 128 / EltSizeInBits;
41590 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
41591 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
41592 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
41593 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
41594 Idx &= (NumEltsPerLane - 1);
41595 }
41596 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
41597 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
41598 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
41599 DAG.getBitcast(VecVT, Vec),
41600 DAG.getIntPtrConstant(Idx, dl));
41601 }
41602 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
41603 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
41604 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
41605 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
41606 DAG.getTargetConstant(Idx, dl, MVT::i8));
41607 }
41608 return SDValue();
41609 };
41610
41611 // Resolve the target shuffle inputs and mask.
41612 SmallVector<int, 16> Mask;
41613 SmallVector<SDValue, 2> Ops;
41614 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
41615 return SDValue();
41616
41617 // Shuffle inputs must be the same size as the result.
41618 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
41619 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
41620 }))
41621 return SDValue();
41622
41623 // Attempt to narrow/widen the shuffle mask to the correct size.
41624 if (Mask.size() != NumSrcElts) {
41625 if ((NumSrcElts % Mask.size()) == 0) {
41626 SmallVector<int, 16> ScaledMask;
41627 int Scale = NumSrcElts / Mask.size();
41628 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
41629 Mask = std::move(ScaledMask);
41630 } else if ((Mask.size() % NumSrcElts) == 0) {
41631 // Simplify Mask based on demanded element.
41632 int ExtractIdx = (int)IdxC.getZExtValue();
41633 int Scale = Mask.size() / NumSrcElts;
41634 int Lo = Scale * ExtractIdx;
41635 int Hi = Scale * (ExtractIdx + 1);
41636 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
41637 if (i < Lo || Hi <= i)
41638 Mask[i] = SM_SentinelUndef;
41639
41640 SmallVector<int, 16> WidenedMask;
41641 while (Mask.size() > NumSrcElts &&
41642 canWidenShuffleElements(Mask, WidenedMask))
41643 Mask = std::move(WidenedMask);
41644 }
41645 }
41646
41647 // If narrowing/widening failed, see if we can extract+zero-extend.
41648 int ExtractIdx;
41649 EVT ExtractVT;
41650 if (Mask.size() == NumSrcElts) {
41651 ExtractIdx = Mask[IdxC.getZExtValue()];
41652 ExtractVT = SrcVT;
41653 } else {
41654 unsigned Scale = Mask.size() / NumSrcElts;
41655 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
41656 return SDValue();
41657 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
41658 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
41659 return SDValue();
41660 ExtractIdx = Mask[ScaledIdx];
41661 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
41662 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
41663 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
41664 "Failed to widen vector type");
41665 }
41666
41667 // If the shuffle source element is undef/zero then we can just accept it.
41668 if (ExtractIdx == SM_SentinelUndef)
41669 return DAG.getUNDEF(VT);
41670
41671 if (ExtractIdx == SM_SentinelZero)
41672 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
41673 : DAG.getConstant(0, dl, VT);
41674
41675 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
41676 ExtractIdx = ExtractIdx % Mask.size();
41677 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
41678 return DAG.getZExtOrTrunc(V, dl, VT);
41679
41680 return SDValue();
41681}
41682
41683/// Extracting a scalar FP value from vector element 0 is free, so extract each
41684/// operand first, then perform the math as a scalar op.
41685static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
41686 const X86Subtarget &Subtarget) {
41687 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
41688 SDValue Vec = ExtElt->getOperand(0);
41689 SDValue Index = ExtElt->getOperand(1);
41690 EVT VT = ExtElt->getValueType(0);
41691 EVT VecVT = Vec.getValueType();
41692
41693 // TODO: If this is a unary/expensive/expand op, allow extraction from a
41694 // non-zero element because the shuffle+scalar op will be cheaper?
41695 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
41696 return SDValue();
41697
41698 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
41699 // extract, the condition code), so deal with those as a special-case.
41700 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
41701 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
41702 if (OpVT != MVT::f32 && OpVT != MVT::f64)
41703 return SDValue();
41704
41705 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
41706 SDLoc DL(ExtElt);
41707 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41708 Vec.getOperand(0), Index);
41709 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
41710 Vec.getOperand(1), Index);
41711 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
41712 }
41713
41714 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
41715 VT != MVT::f64)
41716 return SDValue();
41717
41718 // Vector FP selects don't fit the pattern of FP math ops (because the
41719 // condition has a different type and we have to change the opcode), so deal
41720 // with those here.
41721 // FIXME: This is restricted to pre type legalization by ensuring the setcc
41722 // has i1 elements. If we loosen this we need to convert vector bool to a
41723 // scalar bool.
41724 if (Vec.getOpcode() == ISD::VSELECT &&
41725 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
41726 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
41727 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
41728 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
41729 SDLoc DL(ExtElt);
41730 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
41731 Vec.getOperand(0).getValueType().getScalarType(),
41732 Vec.getOperand(0), Index);
41733 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41734 Vec.getOperand(1), Index);
41735 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
41736 Vec.getOperand(2), Index);
41737 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
41738 }
41739
41740 // TODO: This switch could include FNEG and the x86-specific FP logic ops
41741 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
41742 // missed load folding and fma+fneg combining.
41743 switch (Vec.getOpcode()) {
41744 case ISD::FMA: // Begin 3 operands
41745 case ISD::FMAD:
41746 case ISD::FADD: // Begin 2 operands
41747 case ISD::FSUB:
41748 case ISD::FMUL:
41749 case ISD::FDIV:
41750 case ISD::FREM:
41751 case ISD::FCOPYSIGN:
41752 case ISD::FMINNUM:
41753 case ISD::FMAXNUM:
41754 case ISD::FMINNUM_IEEE:
41755 case ISD::FMAXNUM_IEEE:
41756 case ISD::FMAXIMUM:
41757 case ISD::FMINIMUM:
41758 case X86ISD::FMAX:
41759 case X86ISD::FMIN:
41760 case ISD::FABS: // Begin 1 operand
41761 case ISD::FSQRT:
41762 case ISD::FRINT:
41763 case ISD::FCEIL:
41764 case ISD::FTRUNC:
41765 case ISD::FNEARBYINT:
41766 case ISD::FROUND:
41767 case ISD::FFLOOR:
41768 case X86ISD::FRCP:
41769 case X86ISD::FRSQRT: {
41770 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
41771 SDLoc DL(ExtElt);
41772 SmallVector<SDValue, 4> ExtOps;
41773 for (SDValue Op : Vec->ops())
41774 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
41775 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
41776 }
41777 default:
41778 return SDValue();
41779 }
41780 llvm_unreachable("All opcodes should return within switch");
41781}
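Illustrative sketch (not part of the analyzed source): scalarizeExtEltFP rewrites extract((vector FP op), 0) into the scalar op on the extracted operands; for element-wise operations the two orders agree, which a tiny standalone program can check.

#include <array>
#include <cassert>

int main() {
  // extract (fadd X, Y), 0 --> fadd (extract X, 0), (extract Y, 0)
  std::array<double, 2> X{1.5, 2.0}, Y{0.25, 4.0};
  std::array<double, 2> Sum{};
  for (int i = 0; i != 2; ++i)
    Sum[i] = X[i] + Y[i];        // vector op, then take lane 0
  double Scalar = X[0] + Y[0];   // take lane 0, then scalar op
  assert(Sum[0] == Scalar);
  return 0;
}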
41782
41783/// Try to convert a vector reduction sequence composed of binops and shuffles
41784/// into horizontal ops.
41785static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
41786 const X86Subtarget &Subtarget) {
41787 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
41788
41789 // We need at least SSE2 to do anything here.
41790 if (!Subtarget.hasSSE2())
41791 return SDValue();
41792
41793 ISD::NodeType Opc;
41794 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
41795 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
41796 if (!Rdx)
41797 return SDValue();
41798
41799 SDValue Index = ExtElt->getOperand(1);
41800 assert(isNullConstant(Index) &&
41801        "Reduction doesn't end in an extract from index 0");
41802
41803 EVT VT = ExtElt->getValueType(0);
41804 EVT VecVT = Rdx.getValueType();
41805 if (VecVT.getScalarType() != VT)
41806 return SDValue();
41807
41808 SDLoc DL(ExtElt);
41809
41810 // vXi8 mul reduction - promote to vXi16 mul reduction.
41811 if (Opc == ISD::MUL) {
41812 unsigned NumElts = VecVT.getVectorNumElements();
41813 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
41814 return SDValue();
41815 if (VecVT.getSizeInBits() >= 128) {
41816 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
41817 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41818 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
41819 Lo = DAG.getBitcast(WideVT, Lo);
41820 Hi = DAG.getBitcast(WideVT, Hi);
41821 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
41822 while (Rdx.getValueSizeInBits() > 128) {
41823 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41824 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
41825 }
41826 } else {
41827 if (VecVT == MVT::v4i8)
41828 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41829 DAG.getUNDEF(MVT::v4i8));
41830 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41831 DAG.getUNDEF(MVT::v8i8));
41832 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
41833 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
41834 }
41835 if (NumElts >= 8)
41836 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41837 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41838 {4, 5, 6, 7, -1, -1, -1, -1}));
41839 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41840 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41841 {2, 3, -1, -1, -1, -1, -1, -1}));
41842 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
41843 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
41844 {1, -1, -1, -1, -1, -1, -1, -1}));
41845 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41846 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41847 }
41848
41849 // vXi8 add reduction - sub-128-bit vector.
41850 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
41851 if (VecVT == MVT::v4i8) {
41852 // Pad with zero.
41853 if (Subtarget.hasSSE41()) {
41854 Rdx = DAG.getBitcast(MVT::i32, Rdx);
41855 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
41856 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
41857 DAG.getIntPtrConstant(0, DL));
41858 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41859 } else {
41860 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
41861 DAG.getConstant(0, DL, VecVT));
41862 }
41863 }
41864 if (Rdx.getValueType() == MVT::v8i8) {
41865 // Pad with undef.
41866 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
41867 DAG.getUNDEF(MVT::v8i8));
41868 }
41869 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41870 DAG.getConstant(0, DL, MVT::v16i8));
41871 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41872 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41873 }
41874
41875 // Must be a >=128-bit vector with pow2 elements.
41876 if ((VecVT.getSizeInBits() % 128) != 0 ||
41877 !isPowerOf2_32(VecVT.getVectorNumElements()))
41878 return SDValue();
41879
41880 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
41881 if (VT == MVT::i8) {
41882 while (Rdx.getValueSizeInBits() > 128) {
41883 SDValue Lo, Hi;
41884 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
41885 VecVT = Lo.getValueType();
41886 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
41887 }
41888 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
41889
41890 SDValue Hi = DAG.getVectorShuffle(
41891 MVT::v16i8, DL, Rdx, Rdx,
41892 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
41893 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
41894 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
41895 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
41896 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
41897 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41898 }
41899
41900 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
41901 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
41902 return SDValue();
41903
41904 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
41905
41906 // 256-bit horizontal instructions operate on 128-bit chunks rather than
41907 // across the whole vector, so we need an extract + hop preliminary stage.
41908 // This is the only step where the operands of the hop are not the same value.
41909 // TODO: We could extend this to handle 512-bit or even longer vectors.
41910 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
41911 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
41912 unsigned NumElts = VecVT.getVectorNumElements();
41913 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
41914 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
41915 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
41916 VecVT = Rdx.getValueType();
41917 }
41918 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
41919 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
41920 return SDValue();
41921
41922 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
41923 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
41924 for (unsigned i = 0; i != ReductionSteps; ++i)
41925 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
41926
41927 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
41928}
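Illustrative sketch (not part of the analyzed source): the shuffle+add (or HADD) ladder above folds the upper half onto the lower half log2(N) times, so lane 0 ends up holding the full reduction; the array version below checks that against a plain sum under that assumption.

#include <array>
#include <cassert>
#include <numeric>

int main() {
  std::array<int, 8> V{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<int, 8> R = V;
  for (int Half = 4; Half >= 1; Half /= 2)  // log2(8) = 3 halving steps
    for (int i = 0; i != Half; ++i)
      R[i] += R[i + Half];                  // add the shuffled upper half
  assert(R[0] == std::accumulate(V.begin(), V.end(), 0));
  return 0;
}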
41929
41930/// Detect vector gather/scatter index generation and convert it from being a
41931/// bunch of shuffles and extracts into a somewhat faster sequence.
41932/// For i686, the best sequence is apparently storing the value and loading
41933/// scalars back, while for x64 we should use 64-bit extracts and shifts.
41934static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
41935 TargetLowering::DAGCombinerInfo &DCI,
41936 const X86Subtarget &Subtarget) {
41937 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
41938 return NewOp;
41939
41940 SDValue InputVector = N->getOperand(0);
41941 SDValue EltIdx = N->getOperand(1);
41942 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
41943
41944 EVT SrcVT = InputVector.getValueType();
41945 EVT VT = N->getValueType(0);
41946 SDLoc dl(InputVector);
41947 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
41948 unsigned NumSrcElts = SrcVT.getVectorNumElements();
41949
41950 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
41951 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41952
41953 // Integer Constant Folding.
41954 if (CIdx && VT.isInteger()) {
41955 APInt UndefVecElts;
41956 SmallVector<APInt, 16> EltBits;
41957 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
41958 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
41959 EltBits, true, false)) {
41960 uint64_t Idx = CIdx->getZExtValue();
41961 if (UndefVecElts[Idx])
41962 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
41963 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
41964 dl, VT);
41965 }
41966 }
41967
41968 if (IsPextr) {
41969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41970 if (TLI.SimplifyDemandedBits(
41971 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
41972 return SDValue(N, 0);
41973
41974 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
41975 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
41976 InputVector.getOpcode() == X86ISD::PINSRW) &&
41977 InputVector.getOperand(2) == EltIdx) {
41978 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
41979        "Vector type mismatch");
41980 SDValue Scl = InputVector.getOperand(1);
41981 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
41982 return DAG.getZExtOrTrunc(Scl, dl, VT);
41983 }
41984
41985 // TODO - Remove this once we can handle the implicit zero-extension of
41986 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
41987 // combineBasicSADPattern.
41988 return SDValue();
41989 }
41990
41991 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
41992 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
41993 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
41994 SDValue MMXSrc = InputVector.getOperand(0);
41995
41996 // The bitcast source is a direct mmx result.
41997 if (MMXSrc.getValueType() == MVT::x86mmx)
41998 return DAG.getBitcast(VT, InputVector);
41999 }
42000
42001 // Detect mmx to i32 conversion through a v2i32 elt extract.
42002 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
42003 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
42004 SDValue MMXSrc = InputVector.getOperand(0);
42005
42006 // The bitcast source is a direct mmx result.
42007 if (MMXSrc.getValueType() == MVT::x86mmx)
42008 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
42009 }
42010
42011 // Check whether this extract is the root of a sum of absolute differences
42012 // pattern. This has to be done here because we really want it to happen
42013 // pre-legalization.
42014 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
42015 return SAD;
42016
42017 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
42018 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
42019 return Cmp;
42020
42021 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
42022 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
42023 return MinMax;
42024
42025 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
42026 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
42027 return V;
42028
42029 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
42030 return V;
42031
42032 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
42033 // and then testing the relevant element.
42034 //
42035 // Note that we only combine extracts on the *same* result number, i.e.
42036 // t0 = merge_values a0, a1, a2, a3
42037 // i1 = extract_vector_elt t0, Constant:i64<2>
42038 // i1 = extract_vector_elt t0, Constant:i64<3>
42039 // but not
42040 // i1 = extract_vector_elt t0:1, Constant:i64<2>
42041 // since the latter would need its own MOVMSK.
42042 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
42043 SmallVector<SDNode *, 16> BoolExtracts;
42044 unsigned ResNo = InputVector.getResNo();
42045 auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
42046 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42047 isa<ConstantSDNode>(Use->getOperand(1)) &&
42048 Use->getOperand(0).getResNo() == ResNo &&
42049 Use->getValueType(0) == MVT::i1) {
42050 BoolExtracts.push_back(Use);
42051 return true;
42052 }
42053 return false;
42054 };
42055 if (all_of(InputVector->uses(), IsBoolExtract) &&
42056 BoolExtracts.size() > 1) {
42057 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
42058 if (SDValue BC =
42059 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
42060 for (SDNode *Use : BoolExtracts) {
42061 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
42062 unsigned MaskIdx = Use->getConstantOperandVal(1);
42063 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
42064 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
42065 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
42066 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
42067 DCI.CombineTo(Use, Res);
42068 }
42069 return SDValue(N, 0);
42070 }
42071 }
42072 }
42073
42074 return SDValue();
42075}
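Illustrative sketch (not part of the analyzed source): the MOVMSK trick above packs a vXi1 vector into one integer bitmask so that each boolean extract becomes a single-bit test; the packing loop below stands in for MOVMSK.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<bool, 8> X{true, false, true, true, false, false, true, false};
  uint32_t Msk = 0;
  for (int i = 0; i != 8; ++i)
    Msk |= uint32_t(X[i]) << i;           // stand-in for MOVMSK
  for (int i = 0; i != 8; ++i) {
    uint32_t Bit = uint32_t(1) << i;
    assert(((Msk & Bit) == Bit) == X[i]); // extractelement vXi1 X, i
  }
  return 0;
}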
42076
42077/// If a vector select has an operand that is -1 or 0, try to simplify the
42078/// select to a bitwise logic operation.
42079/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
42080static SDValue
42081combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
42082 TargetLowering::DAGCombinerInfo &DCI,
42083 const X86Subtarget &Subtarget) {
42084 SDValue Cond = N->getOperand(0);
42085 SDValue LHS = N->getOperand(1);
42086 SDValue RHS = N->getOperand(2);
42087 EVT VT = LHS.getValueType();
42088 EVT CondVT = Cond.getValueType();
42089 SDLoc DL(N);
42090 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42091
42092 if (N->getOpcode() != ISD::VSELECT)
42093 return SDValue();
42094
42095 assert(CondVT.isVector() && "Vector select expects a vector selector!");
42096
42097 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
42098 // TODO: Can we assert that both operands are not zeros (because that should
42099 // get simplified at node creation time)?
42100 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
42101 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
42102
42103 // If both inputs are 0/undef, create a complete zero vector.
42104 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
42105 if (TValIsAllZeros && FValIsAllZeros) {
42106 if (VT.isFloatingPoint())
42107 return DAG.getConstantFP(0.0, DL, VT);
42108 return DAG.getConstant(0, DL, VT);
42109 }
42110
42111 // To use the condition operand as a bitwise mask, it must have elements that
42112 // are the same size as the select elements. I.e., the condition operand must
42113 // have already been promoted from the IR select condition type <N x i1>.
42114 // Don't check if the types themselves are equal because that excludes
42115 // vector floating-point selects.
42116 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
42117 return SDValue();
42118
42119 // Try to invert the condition if true value is not all 1s and false value is
42120 // not all 0s. Only do this if the condition has one use.
42121 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
42122 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
42123 // Check if the selector will be produced by CMPP*/PCMP*.
42124 Cond.getOpcode() == ISD::SETCC &&
42125 // Check if SETCC has already been promoted.
42126 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
42127 CondVT) {
42128 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
42129
42130 if (TValIsAllZeros || FValIsAllOnes) {
42131 SDValue CC = Cond.getOperand(2);
42132 ISD::CondCode NewCC = ISD::getSetCCInverse(
42133 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
42134 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
42135 NewCC);
42136 std::swap(LHS, RHS);
42137 TValIsAllOnes = FValIsAllOnes;
42138 FValIsAllZeros = TValIsAllZeros;
42139 }
42140 }
42141
42142 // Cond value must be 'sign splat' to be converted to a logical op.
42143 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
42144 return SDValue();
42145
42146 // vselect Cond, 111..., 000... -> Cond
42147 if (TValIsAllOnes && FValIsAllZeros)
42148 return DAG.getBitcast(VT, Cond);
42149
42150 if (!TLI.isTypeLegal(CondVT))
42151 return SDValue();
42152
42153 // vselect Cond, 111..., X -> or Cond, X
42154 if (TValIsAllOnes) {
42155 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
42156 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
42157 return DAG.getBitcast(VT, Or);
42158 }
42159
42160 // vselect Cond, X, 000... -> and Cond, X
42161 if (FValIsAllZeros) {
42162 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
42163 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
42164 return DAG.getBitcast(VT, And);
42165 }
42166
42167 // vselect Cond, 000..., X -> andn Cond, X
42168 if (TValIsAllZeros) {
42169 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
42170 SDValue AndN;
42171 // The canonical form differs for i1 vectors - x86andnp is not used
42172 if (CondVT.getScalarType() == MVT::i1)
42173 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
42174 CastRHS);
42175 else
42176 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
42177 return DAG.getBitcast(VT, AndN);
42178 }
42179
42180 return SDValue();
42181}
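Illustrative sketch (not part of the analyzed source): when each condition lane is 0 or all-ones, the select arms collapse to the bitwise ops used above; one 32-bit lane is shown, and vectors apply this lane-wise.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t M : {0u, 0xFFFFFFFFu}) {
    uint32_t X = 0x12345678u;
    assert((M ? 0xFFFFFFFFu : X) == (M | X));   // vselect M, 111..., X
    assert((M ? X : 0u) == (M & X));            // vselect M, X, 000...
    assert((M ? 0u : X) == (~M & X));           // vselect M, 000..., X (ANDN)
  }
  return 0;
}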
42182
42183/// If both arms of a vector select are concatenated vectors, split the select,
42184/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
42185/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
42186/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
42187static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
42188 const X86Subtarget &Subtarget) {
42189 unsigned Opcode = N->getOpcode();
42190 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
42191 return SDValue();
42192
42193 // TODO: Split 512-bit vectors too?
42194 EVT VT = N->getValueType(0);
42195 if (!VT.is256BitVector())
42196 return SDValue();
42197
42198 // TODO: Split as long as any 2 of the 3 operands are concatenated?
42199 SDValue Cond = N->getOperand(0);
42200 SDValue TVal = N->getOperand(1);
42201 SDValue FVal = N->getOperand(2);
42202 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
42203 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
42204 !collectConcatOps(TVal.getNode(), CatOpsT) ||
42205 !collectConcatOps(FVal.getNode(), CatOpsF))
42206 return SDValue();
42207
42208 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
42209 ArrayRef<SDValue> Ops) {
42210 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
42211 };
42212 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
42213 makeBlend, /*CheckBWI*/ false);
42214}
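Illustrative sketch (not part of the analyzed source): splitting the select is sound because a lane-wise select over concatenated halves equals the concatenation of per-half selects.

#include <array>
#include <cassert>

int main() {
  std::array<bool, 8> Cond{true, false, true, true, false, true, false, false};
  std::array<int, 8> T{0, 1, 2, 3, 4, 5, 6, 7};
  std::array<int, 8> F{70, 71, 72, 73, 74, 75, 76, 77};
  std::array<int, 8> Whole{}, Halves{};
  for (int i = 0; i != 8; ++i)
    Whole[i] = Cond[i] ? T[i] : F[i];                    // one wide select
  for (int Base : {0, 4})                                // two narrow selects
    for (int i = 0; i != 4; ++i)
      Halves[Base + i] = Cond[Base + i] ? T[Base + i] : F[Base + i];
  assert(Whole == Halves);
  return 0;
}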
42215
42216static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
42217 SDValue Cond = N->getOperand(0);
42218 SDValue LHS = N->getOperand(1);
42219 SDValue RHS = N->getOperand(2);
42220 SDLoc DL(N);
42221
42222 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
42223 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
42224 if (!TrueC || !FalseC)
42225 return SDValue();
42226
42227 // Don't do this for crazy integer types.
42228 EVT VT = N->getValueType(0);
42229 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
42230 return SDValue();
42231
42232 // We're going to use the condition bit in math or logic ops. We could allow
42233 // this with a wider condition value (post-legalization it becomes an i8),
42234 // but if nothing is creating selects that late, it doesn't matter.
42235 if (Cond.getValueType() != MVT::i1)
42236 return SDValue();
42237
42238 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
42239 // 3, 5, or 9 with i32/i64, so those get transformed too.
42240 // TODO: For constants that overflow or do not differ by power-of-2 or small
42241 // multiplier, convert to 'and' + 'add'.
42242 const APInt &TrueVal = TrueC->getAPIntValue();
42243 const APInt &FalseVal = FalseC->getAPIntValue();
42244 bool OV;
42245 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
42246 if (OV)
42247 return SDValue();
42248
42249 APInt AbsDiff = Diff.abs();
42250 if (AbsDiff.isPowerOf2() ||
42251 ((VT == MVT::i32 || VT == MVT::i64) &&
42252 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
42253
42254 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
42255 // of the condition can usually be folded into a compare predicate, but even
42256 // without that, the sequence should be cheaper than a CMOV alternative.
42257 if (TrueVal.slt(FalseVal)) {
42258 Cond = DAG.getNOT(DL, Cond, MVT::i1);
42259 std::swap(TrueC, FalseC);
42260 }
42261
42262 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
42263 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
42264
42265 // Multiply condition by the difference if non-one.
42266 if (!AbsDiff.isOneValue())
42267 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
42268
42269 // Add the base if non-zero.
42270 if (!FalseC->isNullValue())
42271 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
42272
42273 return R;
42274 }
42275
42276 return SDValue();
42277}
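Illustrative sketch (not part of the analyzed source): the identity behind the transform, select Cond, TC, FC == zext(Cond) * (TC - FC) + FC, can be checked directly (the combine additionally flips Cond so the multiplier is positive).

#include <cassert>
#include <cstdint>

int main() {
  const int64_t TC = 9, FC = 4;
  for (bool Cond : {false, true}) {
    int64_t Sel = Cond ? TC : FC;
    int64_t Math = int64_t(Cond) * (TC - FC) + FC; // zext(Cond)*(TC-FC)+FC
    assert(Sel == Math);
  }
  return 0;
}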
42278
42279/// If this is a *dynamic* select (non-constant condition) and we can match
42280/// this node with one of the variable blend instructions, restructure the
42281/// condition so that blends can use the high (sign) bit of each element.
42282/// This function will also call SimplifyDemandedBits on already created
42283/// BLENDV to perform additional simplifications.
42284static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
42285 TargetLowering::DAGCombinerInfo &DCI,
42286 const X86Subtarget &Subtarget) {
42287 SDValue Cond = N->getOperand(0);
42288 if ((N->getOpcode() != ISD::VSELECT &&
42289 N->getOpcode() != X86ISD::BLENDV) ||
42290 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
42291 return SDValue();
42292
42293 // Don't optimize before the condition has been transformed to a legal type
42294 // and don't ever optimize vector selects that map to AVX512 mask-registers.
42295 unsigned BitWidth = Cond.getScalarValueSizeInBits();
42296 if (BitWidth < 8 || BitWidth > 64)
42297 return SDValue();
42298
42299 // We can only handle the cases where VSELECT is directly legal on the
42300 // subtarget. We custom lower VSELECT nodes with constant conditions and
42301 // this makes it hard to see whether a dynamic VSELECT will correctly
42302 // lower, so we both check the operation's status and explicitly handle the
42303 // cases where a *dynamic* blend will fail even though a constant-condition
42304 // blend could be custom lowered.
42305 // FIXME: We should find a better way to handle this class of problems.
42306 // Potentially, we should combine constant-condition vselect nodes
42307 // pre-legalization into shuffles and not mark as many types as custom
42308 // lowered.
42309 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42310 EVT VT = N->getValueType(0);
42311 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
42312 return SDValue();
42313 // FIXME: We don't support i16-element blends currently. We could and
42314 // should support them by making *all* the bits in the condition be set
42315 // rather than just the high bit and using an i8-element blend.
42316 if (VT.getVectorElementType() == MVT::i16)
42317 return SDValue();
42318 // Dynamic blending was only available from SSE4.1 onward.
42319 if (VT.is128BitVector() && !Subtarget.hasSSE41())
42320 return SDValue();
42321 // Byte blends are only available in AVX2
42322 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
42323 return SDValue();
42324 // There are no 512-bit blend instructions that use sign bits.
42325 if (VT.is512BitVector())
42326 return SDValue();
42327
42328 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
42329 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
42330 UI != UE; ++UI)
42331 if ((UI->getOpcode() != ISD::VSELECT &&
42332 UI->getOpcode() != X86ISD::BLENDV) ||
42333 UI.getOperandNo() != 0)
42334 return false;
42335
42336 return true;
42337 };
42338
42339 APInt DemandedBits(APInt::getSignMask(BitWidth));
42340
42341 if (OnlyUsedAsSelectCond(Cond)) {
42342 KnownBits Known;
42343 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
42344 !DCI.isBeforeLegalizeOps());
42345 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
42346 return SDValue();
42347
42348 // If we changed the computation somewhere in the DAG, this change will
42349 // affect all users of Cond. Update all the nodes so that we do not use
42350 // the generic VSELECT anymore. Otherwise, we may perform wrong
42351 // optimizations as we messed with the actual expectation for the vector
42352 // boolean values.
42353 for (SDNode *U : Cond->uses()) {
42354 if (U->getOpcode() == X86ISD::BLENDV)
42355 continue;
42356
42357 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
42358 Cond, U->getOperand(1), U->getOperand(2));
42359 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
42360 DCI.AddToWorklist(U);
42361 }
42362 DCI.CommitTargetLoweringOpt(TLO);
42363 return SDValue(N, 0);
42364 }
42365
42366 // Otherwise we can still at least try to simplify multiple use bits.
42367 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
42368 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
42369 N->getOperand(1), N->getOperand(2));
42370
42371 return SDValue();
42372}
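Illustrative sketch (not part of the analyzed source): a BLENDV-style variable blend keys only off the sign bit of each condition element, so once the condition is a sign splat it behaves like a full 0/all-ones mask; one byte lane is shown.

#include <cassert>
#include <cstdint>

int main() {
  for (int8_t CondLane : {int8_t(0), int8_t(-1), int8_t(-128), int8_t(0x40)}) {
    uint8_t T = 0xAA, F = 0x55;
    int Blend = (CondLane < 0) ? T : F;            // sign-bit test (BLENDV)
    uint8_t Mask = (CondLane < 0) ? 0xFF : 0x00;   // sign-splat mask
    assert(Blend == ((Mask & T) | (~Mask & F)));
  }
  return 0;
}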
42373
42374// Try to match:
42375// (or (and (M, (sub 0, X)), (pandn M, X)))
42376// which is a special case of:
42377// (select M, (sub 0, X), X)
42378// Per:
42379// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
42380// We know that, if fNegate is 0 or 1:
42381// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
42382//
42383// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
42384// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
42385// ( M ? -X : X) == ((X ^ M ) + (M & 1))
42386// This lets us transform our vselect to:
42387// (add (xor X, M), (and M, 1))
42388// And further to:
42389// (sub (xor X, M), M)
42390static SDValue combineLogicBlendIntoConditionalNegate(
42391 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
42392 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
42393 EVT MaskVT = Mask.getValueType();
42394 assert(MaskVT.isInteger() &&
42395        DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
42396        "Mask must be zero/all-bits");
42397
42398 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
42399 return SDValue();
42400 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
42401 return SDValue();
42402
42403 auto IsNegV = [](SDNode *N, SDValue V) {
42404 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
42405 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
42406 };
42407
42408 SDValue V;
42409 if (IsNegV(Y.getNode(), X))
42410 V = X;
42411 else if (IsNegV(X.getNode(), Y))
42412 V = Y;
42413 else
42414 return SDValue();
42415
42416 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
42417 SDValue SubOp2 = Mask;
42418
42419 // If the negate was on the false side of the select, then
42420 // the operands of the SUB need to be swapped. PR 27251.
42421 // This is because the pattern being matched above is
42422 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
42423 // but if the pattern matched was
42424 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
42425 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
42426 // pattern also needs to be a negation of the replacement pattern above.
42427 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
42428 // sub accomplishes the negation of the replacement pattern.
42429 if (V == Y)
42430 std::swap(SubOp1, SubOp2);
42431
42432 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
42433 return DAG.getBitcast(VT, Res);
42434}
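Illustrative sketch (not part of the analyzed source): the conditional-negate bithack referenced above, namely that with a mask M of 0 or all-ones (M ? -X : X) == (X ^ M) - M, checked on a scalar lane (the combine applies it per vector lane).

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t M : {int32_t(0), int32_t(-1)}) {
    for (int32_t X : {5, -7, 0}) {
      int32_t Sel = M ? -X : X;
      int32_t Sub = (X ^ M) - M;  // (sub (xor X, M), M)
      assert(Sel == Sub);
    }
  }
  return 0;
}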
42435
42436/// Do target-specific dag combines on SELECT and VSELECT nodes.
42437static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
42438 TargetLowering::DAGCombinerInfo &DCI,
42439 const X86Subtarget &Subtarget) {
42440 SDLoc DL(N);
42441 SDValue Cond = N->getOperand(0);
42442 SDValue LHS = N->getOperand(1);
42443 SDValue RHS = N->getOperand(2);
42444
42445 // Try simplification again because we use this function to optimize
42446 // BLENDV nodes that are not handled by the generic combiner.
42447 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
42448 return V;
42449
42450 EVT VT = LHS.getValueType();
42451 EVT CondVT = Cond.getValueType();
42452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42453 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
42454
42455 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
42456 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
42457 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
42458 if (CondVT.isVector() && CondVT.isInteger() &&
42459 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
42460 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
42461 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
42462 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
42463 DL, DAG, Subtarget))
42464 return V;
42465
42466 // Convert vselects with constant condition into shuffles.
42467 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
42468 SmallVector<int, 64> Mask;
42469 if (createShuffleMaskFromVSELECT(Mask, Cond))
42470 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
42471 }
42472
42473 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
42474 // by forcing the unselected elements to zero.
42475 // TODO: Can we handle more shuffles with this?
42476 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
42477 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
42478 LHS.hasOneUse() && RHS.hasOneUse()) {
42479 MVT SimpleVT = VT.getSimpleVT();
42480 SmallVector<SDValue, 1> LHSOps, RHSOps;
42481 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
42482 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
42483 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
42484 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
42485 int NumElts = VT.getVectorNumElements();
42486 for (int i = 0; i != NumElts; ++i) {
42487 if (CondMask[i] < NumElts)
42488 RHSMask[i] = 0x80;
42489 else
42490 LHSMask[i] = 0x80;
42491 }
42492 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
42493 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
42494 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
42495 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
42496 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
42497 }
42498 }
42499
42500 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
42501 // instructions match the semantics of the common C idiom x<y?x:y but not
42502 // x<=y?x:y, because of how they handle negative zero (which can be
42503 // ignored in unsafe-math mode).
42504 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
42505 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
42506 VT != MVT::f80 && VT != MVT::f128 &&
42507 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
42508 (Subtarget.hasSSE2() ||
42509 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
42510 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42511
42512 unsigned Opcode = 0;
42513 // Check for x CC y ? x : y.
42514 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
42515 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
42516 switch (CC) {
42517 default: break;
42518 case ISD::SETULT:
42519 // Converting this to a min would handle NaNs incorrectly, and swapping
42520 // the operands would cause it to handle comparisons between positive
42521 // and negative zero incorrectly.
42522 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
42523 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42524 !(DAG.isKnownNeverZeroFloat(LHS) ||
42525 DAG.isKnownNeverZeroFloat(RHS)))
42526 break;
42527 std::swap(LHS, RHS);
42528 }
42529 Opcode = X86ISD::FMIN;
42530 break;
42531 case ISD::SETOLE:
42532 // Converting this to a min would handle comparisons between positive
42533 // and negative zero incorrectly.
42534 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42535 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
42536 break;
42537 Opcode = X86ISD::FMIN;
42538 break;
42539 case ISD::SETULE:
42540 // Converting this to a min would handle both negative zeros and NaNs
42541 // incorrectly, but we can swap the operands to fix both.
42542 std::swap(LHS, RHS);
42543 LLVM_FALLTHROUGH;
42544 case ISD::SETOLT:
42545 case ISD::SETLT:
42546 case ISD::SETLE:
42547 Opcode = X86ISD::FMIN;
42548 break;
42549
42550 case ISD::SETOGE:
42551 // Converting this to a max would handle comparisons between positive
42552 // and negative zero incorrectly.
42553 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42554 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
42555 break;
42556 Opcode = X86ISD::FMAX;
42557 break;
42558 case ISD::SETUGT:
42559 // Converting this to a max would handle NaNs incorrectly, and swapping
42560 // the operands would cause it to handle comparisons between positive
42561 // and negative zero incorrectly.
42562 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
42563 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42564 !(DAG.isKnownNeverZeroFloat(LHS) ||
42565 DAG.isKnownNeverZeroFloat(RHS)))
42566 break;
42567 std::swap(LHS, RHS);
42568 }
42569 Opcode = X86ISD::FMAX;
42570 break;
42571 case ISD::SETUGE:
42572 // Converting this to a max would handle both negative zeros and NaNs
42573 // incorrectly, but we can swap the operands to fix both.
42574 std::swap(LHS, RHS);
42575 LLVM_FALLTHROUGH;
42576 case ISD::SETOGT:
42577 case ISD::SETGT:
42578 case ISD::SETGE:
42579 Opcode = X86ISD::FMAX;
42580 break;
42581 }
42582 // Check for x CC y ? y : x -- a min/max with reversed arms.
42583 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
42584 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
42585 switch (CC) {
42586 default: break;
42587 case ISD::SETOGE:
42588 // Converting this to a min would handle comparisons between positive
42589 // and negative zero incorrectly, and swapping the operands would
42590 // cause it to handle NaNs incorrectly.
42591 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42592 !(DAG.isKnownNeverZeroFloat(LHS) ||
42593 DAG.isKnownNeverZeroFloat(RHS))) {
42594 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42595 break;
42596 std::swap(LHS, RHS);
42597 }
42598 Opcode = X86ISD::FMIN;
42599 break;
42600 case ISD::SETUGT:
42601 // Converting this to a min would handle NaNs incorrectly.
42602 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42603 break;
42604 Opcode = X86ISD::FMIN;
42605 break;
42606 case ISD::SETUGE:
42607 // Converting this to a min would handle both negative zeros and NaNs
42608 // incorrectly, but we can swap the operands to fix both.
42609 std::swap(LHS, RHS);
42610 LLVM_FALLTHROUGH;
42611 case ISD::SETOGT:
42612 case ISD::SETGT:
42613 case ISD::SETGE:
42614 Opcode = X86ISD::FMIN;
42615 break;
42616
42617 case ISD::SETULT:
42618 // Converting this to a max would handle NaNs incorrectly.
42619 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42620 break;
42621 Opcode = X86ISD::FMAX;
42622 break;
42623 case ISD::SETOLE:
42624 // Converting this to a max would handle comparisons between positive
42625 // and negative zero incorrectly, and swapping the operands would
42626 // cause it to handle NaNs incorrectly.
42627 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
42628 !DAG.isKnownNeverZeroFloat(LHS) &&
42629 !DAG.isKnownNeverZeroFloat(RHS)) {
42630 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
42631 break;
42632 std::swap(LHS, RHS);
42633 }
42634 Opcode = X86ISD::FMAX;
42635 break;
42636 case ISD::SETULE:
42637 // Converting this to a max would handle both negative zeros and NaNs
42638 // incorrectly, but we can swap the operands to fix both.
42639 std::swap(LHS, RHS);
42640 LLVM_FALLTHROUGH;
42641 case ISD::SETOLT:
42642 case ISD::SETLT:
42643 case ISD::SETLE:
42644 Opcode = X86ISD::FMAX;
42645 break;
42646 }
42647 }
42648
42649 if (Opcode)
42650 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
42651 }
42652
42653 // Some mask scalar intrinsics rely on checking if only one bit is set
42654 // and implement it in C code like this:
42655 // A[0] = (U & 1) ? A[0] : W[0];
42656 // This creates some redundant instructions that break pattern matching.
42657 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
42658 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
42659 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
42660 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42661 SDValue AndNode = Cond.getOperand(0);
42662 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
42663 isNullConstant(Cond.getOperand(1)) &&
42664 isOneConstant(AndNode.getOperand(1))) {
42665 // LHS and RHS swapped due to
42666 // setcc outputting 1 when AND resulted in 0 and vice versa.
42667 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
42668 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
42669 }
42670 }
42671
42672 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
42673 // lowering on KNL. In this case we convert it to
42674 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
42675 // The same situation applies to all vectors of i8 and i16 without BWI.
42676 // Make sure we extend these even before type legalization gets a chance to
42677 // split wide vectors.
42678 // Since SKX these selects have a proper lowering.
42679 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
42680 CondVT.getVectorElementType() == MVT::i1 &&
42681 (VT.getVectorElementType() == MVT::i8 ||
42682 VT.getVectorElementType() == MVT::i16)) {
42683 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
42684 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
42685 }
42686
42687 // AVX512 - Extend select with zero to merge with target shuffle.
42688 // select(mask, extract_subvector(shuffle(x)), zero) -->
42689 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
42690 // TODO - support non target shuffles as well.
42691 if (Subtarget.hasAVX512() && CondVT.isVector() &&
42692 CondVT.getVectorElementType() == MVT::i1) {
42693 auto SelectableOp = [&TLI](SDValue Op) {
42694 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42695 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
42696 isNullConstant(Op.getOperand(1)) &&
42697 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
42698 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
42699 };
42700
42701 bool SelectableLHS = SelectableOp(LHS);
42702 bool SelectableRHS = SelectableOp(RHS);
42703 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
42704 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
42705
42706 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
42707 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
42708 : RHS.getOperand(0).getValueType();
42709 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
42710 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
42711 VT.getSizeInBits());
42712 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
42713 VT.getSizeInBits());
42714 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
42715 DAG.getUNDEF(SrcCondVT), Cond,
42716 DAG.getIntPtrConstant(0, DL));
42717 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
42718 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
42719 }
42720 }
42721
42722 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
42723 return V;
42724
42725 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
42726 Cond.hasOneUse()) {
42727 EVT CondVT = Cond.getValueType();
42728 SDValue Cond0 = Cond.getOperand(0);
42729 SDValue Cond1 = Cond.getOperand(1);
42730 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
42731
42732 // Canonicalize min/max:
42733 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
42734 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
42735 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
42736 // the need for an extra compare against zero. e.g.
42737 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
42738 // subl %esi, %edi
42739 // testl %edi, %edi
42740 // movl $0, %eax
42741 // cmovgl %edi, %eax
42742 // =>
42743 // xorl %eax, %eax
42744 // subl %esi, $edi
42745 // cmovsl %eax, %edi
42746 //
42747 // We can also canonicalize
42748 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
42749 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
42750 // This allows the use of a test instruction for the compare.
42751 if (LHS == Cond0 && RHS == Cond1) {
42752 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
42753 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
42754 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
42755 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42756 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42757 }
42758 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
42759 ISD::CondCode NewCC = ISD::SETUGE;
42760 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
42761 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
42762 }
42763 }
42764
42765 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
42766 // fold eq + gt/lt nested selects into ge/le selects
42767 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
42768 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
42769 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
42770 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
42771 // .. etc ..
42772 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
42773 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
42774 SDValue InnerSetCC = RHS.getOperand(0);
42775 ISD::CondCode InnerCC =
42776 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
42777 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
42778 Cond0 == InnerSetCC.getOperand(0) &&
42779 Cond1 == InnerSetCC.getOperand(1)) {
42780 ISD::CondCode NewCC;
42781 switch (CC == ISD::SETEQ ? InnerCC : CC) {
42782 case ISD::SETGT: NewCC = ISD::SETGE; break;
42783 case ISD::SETLT: NewCC = ISD::SETLE; break;
42784 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
42785 case ISD::SETULT: NewCC = ISD::SETULE; break;
42786 default: NewCC = ISD::SETCC_INVALID; break;
42787 }
42788 if (NewCC != ISD::SETCC_INVALID) {
42789 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
42790 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
42791 }
42792 }
42793 }
42794 }
42795
42796 // Check if the first operand is all zeros and Cond type is vXi1.
42797 // If this is an AVX512 target we can improve the use of zero masking by
42798 // swapping the operands and inverting the condition.
42799 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
42800 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
42801 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
42802 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
42803 // Invert the cond to not(cond) : xor(op,allones)=not(op)
42804 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
42805 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
42806 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
42807 }
42808
42809 // Early exit check
42810 if (!TLI.isTypeLegal(VT))
42811 return SDValue();
42812
42813 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
42814 return V;
42815
42816 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
42817 return V;
42818
42819 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
42820 return V;
42821
42822 // select(~Cond, X, Y) -> select(Cond, Y, X)
42823 if (CondVT.getScalarType() != MVT::i1) {
42824 if (SDValue CondNot = IsNOT(Cond, DAG))
42825 return DAG.getNode(N->getOpcode(), DL, VT,
42826 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
42827 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the signbit.
42828 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse() &&
42829 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
42830 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
42831 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
42832 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
42833 }
42834 }
42835
42836 // Try to optimize vXi1 selects if both operands are either all constants or
42837 // bitcasts from scalar integer type. In that case we can convert the operands
42838 // to integer and use an integer select which will be converted to a CMOV.
42839 // We need to take a little bit of care to avoid creating an i64 type after
42840 // type legalization.
42841 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
42842 VT.getVectorElementType() == MVT::i1 &&
42843 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
42844 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42845 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
42846 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
42847
42848 if ((LHSIsConst ||
42849 (LHS.getOpcode() == ISD::BITCAST &&
42850 LHS.getOperand(0).getValueType() == IntVT)) &&
42851 (RHSIsConst ||
42852 (RHS.getOpcode() == ISD::BITCAST &&
42853 RHS.getOperand(0).getValueType() == IntVT))) {
42854 if (LHSIsConst)
42855 LHS = combinevXi1ConstantToInteger(LHS, DAG);
42856 else
42857 LHS = LHS.getOperand(0);
42858
42859 if (RHSIsConst)
42860 RHS = combinevXi1ConstantToInteger(RHS, DAG);
42861 else
42862 RHS = RHS.getOperand(0);
42863
42864 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
42865 return DAG.getBitcast(VT, Select);
42866 }
42867 }
42868
42869 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
42870 // single bits, then invert the predicate and swap the select operands.
42871 // This can lower using a vector shift bit-hack rather than mask and compare.
42872 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
42873 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
42874 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
42875 Cond.getOperand(0).getOpcode() == ISD::AND &&
42876 isNullOrNullSplat(Cond.getOperand(1)) &&
42877 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
42878 Cond.getOperand(0).getValueType() == VT) {
42879 // The 'and' mask must be composed of power-of-2 constants.
42880 SDValue And = Cond.getOperand(0);
42881 auto *C = isConstOrConstSplat(And.getOperand(1));
42882 if (C && C->getAPIntValue().isPowerOf2()) {
42883 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
42884 SDValue NotCond =
42885 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
42886 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
42887 }
42888
42889 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
42890 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
42891 // 16-bit lacks a proper blendv.
42892 unsigned EltBitWidth = VT.getScalarSizeInBits();
42893 bool CanShiftBlend =
42894 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
42895 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
42896 (Subtarget.hasXOP()));
42897 if (CanShiftBlend &&
42898 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
42899 return C->getAPIntValue().isPowerOf2();
42900 })) {
42901 // Create a left-shift constant to get the mask bits over to the sign-bit.
42902 SDValue Mask = And.getOperand(1);
42903 SmallVector<int, 32> ShlVals;
42904 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
42905 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
42906 ShlVals.push_back(EltBitWidth - 1 -
42907 MaskVal->getAPIntValue().exactLogBase2());
42908 }
42909 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
42910 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
42911 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
42912 SDValue NewCond =
42913 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
42914 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
42915 }
42916 }
42917
42918 return SDValue();
42919}
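Illustrative sketch (not part of the analyzed source): the final shift bit-hack in combineSelect moves the single tested mask bit into the sign position and swaps the select arms; for C = 1 << K on a 32-bit lane, ((X & C) == 0) ? Y : Z equals ((int32_t)(X << (31 - K)) < 0) ? Z : Y, assuming the usual two's-complement narrowing.

#include <cassert>
#include <cstdint>

int main() {
  const int Y = 111, Z = 222;
  for (unsigned K = 0; K != 32; ++K) {
    uint32_t C = uint32_t(1) << K;
    for (uint32_t X : {uint32_t(0), C, ~C, uint32_t(0xDEADBEEF)}) {
      int Orig = ((X & C) == 0) ? Y : Z;
      int Shifted = (int32_t(X << (31 - K)) < 0) ? Z : Y; // bit K -> sign bit
      assert(Orig == Shifted);
    }
  }
  return 0;
}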
42920
42921/// Combine:
42922/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
42923/// to:
42924/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
42925/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
42926/// Note that this is only legal for some op/cc combinations.
42927static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
42928 SelectionDAG &DAG,
42929 const X86Subtarget &Subtarget) {
42930 // This combine only operates on CMP-like nodes.
42931 if (!(Cmp.getOpcode() == X86ISD::CMP ||
42932 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
42933 return SDValue();
42934
42935 // Can't replace the cmp if it has more uses than the one we're looking at.
42936 // FIXME: We would like to be able to handle this, but would need to make sure
42937 // all uses were updated.
42938 if (!Cmp.hasOneUse())
42939 return SDValue();
42940
42941 // This only applies to variations of the common case:
42942 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
42943 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
42944 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
42945 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
42946 // Using the proper condcodes (see below), overflow is checked for.
42947
42948 // FIXME: We can generalize both constraints:
42949 // - XOR/OR/AND (if they were made to survive AtomicExpand)
42950 // - LHS != 1
42951 // if the result is compared.
42952
42953 SDValue CmpLHS = Cmp.getOperand(0);
42954 SDValue CmpRHS = Cmp.getOperand(1);
42955 EVT CmpVT = CmpLHS.getValueType();
42956
42957 if (!CmpLHS.hasOneUse())
42958 return SDValue();
42959
42960 unsigned Opc = CmpLHS.getOpcode();
42961 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
42962 return SDValue();
42963
42964 SDValue OpRHS = CmpLHS.getOperand(2);
42965 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
42966 if (!OpRHSC)
42967 return SDValue();
42968
42969 APInt Addend = OpRHSC->getAPIntValue();
42970 if (Opc == ISD::ATOMIC_LOAD_SUB)
42971 Addend = -Addend;
42972
42973 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
42974 if (!CmpRHSC)
42975 return SDValue();
42976
42977 APInt Comparison = CmpRHSC->getAPIntValue();
42978 APInt NegAddend = -Addend;
42979
42980 // See if we can adjust the CC to make the comparison match the negated
42981 // addend.
42982 if (Comparison != NegAddend) {
42983 APInt IncComparison = Comparison + 1;
42984 if (IncComparison == NegAddend) {
42985 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
42986 Comparison = IncComparison;
42987 CC = X86::COND_AE;
42988 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
42989 Comparison = IncComparison;
42990 CC = X86::COND_L;
42991 }
42992 }
42993 APInt DecComparison = Comparison - 1;
42994 if (DecComparison == NegAddend) {
42995 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
42996 Comparison = DecComparison;
42997 CC = X86::COND_A;
42998 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
42999 Comparison = DecComparison;
43000 CC = X86::COND_LE;
43001 }
43002 }
43003 }
43004
43005 // If the addend is the negation of the comparison value, then we can do
43006 // a full comparison by emitting the atomic arithmetic as a locked sub.
43007 if (Comparison == NegAddend) {
43008 // The CC is fine, but we need to rewrite the LHS of the comparison as an
43009 // atomic sub.
43010 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
43011 auto AtomicSub = DAG.getAtomic(
43012 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
43013 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
43014 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
43015 AN->getMemOperand());
43016 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
43017 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
43018 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
43019 return LockOp;
43020 }
43021
43022 // We can handle comparisons with zero in a number of cases by manipulating
43023 // the CC used.
43024 if (!Comparison.isNullValue())
43025 return SDValue();
43026
43027 if (CC == X86::COND_S && Addend == 1)
43028 CC = X86::COND_LE;
43029 else if (CC == X86::COND_NS && Addend == 1)
43030 CC = X86::COND_G;
43031 else if (CC == X86::COND_G && Addend == -1)
43032 CC = X86::COND_GE;
43033 else if (CC == X86::COND_LE && Addend == -1)
43034 CC = X86::COND_L;
43035 else
43036 return SDValue();
43037
43038 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
43039 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
43040 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
43041 return LockOp;
43042}
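// [Illustrative sketch, not part of the original file.] Why COND_S on
// CMP(old, 0) can be answered from the flags of the locked "old + 1": x86's
// LE condition is ZF || (SF != OF), and OF records the signed overflow of the
// increment, so the equivalence holds even for old == INT_MAX. Two's-complement
// narrowing is assumed; the helper name is made up for the demo.
static bool signedLessThanZeroFromAddFlagsSketch(int Old) {
  long long Wide = (long long)Old + 1; // infinitely precise result
  int Res = (int)Wide;                 // what the 32-bit ADD actually produces
  bool ZF = (Res == 0);
  bool SF = (Res < 0);
  bool OF = (Wide != (long long)Res);  // signed overflow of the add
  return ZF || (SF != OF);             // COND_LE on the ADD == (Old < 0)
}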
43043
43044// Check whether a boolean test is testing a boolean value generated by
43045// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
43046// code.
43047//
43048// Simplify the following patterns:
43049// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
43050// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
43051// to (Op EFLAGS Cond)
43052//
43053// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
43054// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
43055// to (Op EFLAGS !Cond)
43056//
43057// where Op could be BRCOND or CMOV.
43058//
43059static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
43060 // This combine only operates on CMP-like nodes.
43061 if (!(Cmp.getOpcode() == X86ISD::CMP ||
43062 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
43063 return SDValue();
43064
43065 // Quit if not used as a boolean value.
43066 if (CC != X86::COND_E && CC != X86::COND_NE)
43067 return SDValue();
43068
43069 // Check CMP operands. One of them should be 0 or 1 and the other should be
43070 // a SetCC or extended from it.
43071 SDValue Op1 = Cmp.getOperand(0);
43072 SDValue Op2 = Cmp.getOperand(1);
43073
43074 SDValue SetCC;
43075 const ConstantSDNode* C = nullptr;
43076 bool needOppositeCond = (CC == X86::COND_E);
43077 bool checkAgainstTrue = false; // Is it a comparison against 1?
43078
43079 if ((C = dyn_cast<ConstantSDNode>(Op1)))
43080 SetCC = Op2;
43081 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
43082 SetCC = Op1;
43083 else // Quit if neither operand is a constant.
43084 return SDValue();
43085
43086 if (C->getZExtValue() == 1) {
43087 needOppositeCond = !needOppositeCond;
43088 checkAgainstTrue = true;
43089 } else if (C->getZExtValue() != 0)
43090 // Quit if the constant is neither 0 nor 1.
43091 return SDValue();
43092
43093 bool truncatedToBoolWithAnd = false;
43094 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
43095 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
43096 SetCC.getOpcode() == ISD::TRUNCATE ||
43097 SetCC.getOpcode() == ISD::AND) {
43098 if (SetCC.getOpcode() == ISD::AND) {
43099 int OpIdx = -1;
43100 if (isOneConstant(SetCC.getOperand(0)))
43101 OpIdx = 1;
43102 if (isOneConstant(SetCC.getOperand(1)))
43103 OpIdx = 0;
43104 if (OpIdx < 0)
43105 break;
43106 SetCC = SetCC.getOperand(OpIdx);
43107 truncatedToBoolWithAnd = true;
43108 } else
43109 SetCC = SetCC.getOperand(0);
43110 }
43111
43112 switch (SetCC.getOpcode()) {
43113 case X86ISD::SETCC_CARRY:
43114 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
43115 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
43116 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
43117 // truncated to i1 using 'and'.
43118 if (checkAgainstTrue && !truncatedToBoolWithAnd)
43119 break;
43120 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
43121 "Invalid use of SETCC_CARRY!");
43122 LLVM_FALLTHROUGH;
43123 case X86ISD::SETCC:
43124 // Set the condition code or opposite one if necessary.
43125 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
43126 if (needOppositeCond)
43127 CC = X86::GetOppositeBranchCondition(CC);
43128 return SetCC.getOperand(1);
43129 case X86ISD::CMOV: {
43130 // Check whether the false/true values are canonical, i.e. 0 or 1.
43131 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
43132 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
43133 // Quit if true value is not a constant.
43134 if (!TVal)
43135 return SDValue();
43136 // Quit if false value is not a constant.
43137 if (!FVal) {
43138 SDValue Op = SetCC.getOperand(0);
43139 // Skip 'zext' or 'trunc' node.
43140 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
43141 Op.getOpcode() == ISD::TRUNCATE)
43142 Op = Op.getOperand(0);
43143 // A special case for rdrand/rdseed, where 0 is set if false cond is
43144 // found.
43145 if ((Op.getOpcode() != X86ISD::RDRAND &&
43146 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
43147 return SDValue();
43148 }
43149 // Quit if false value is not the constant 0 or 1.
43150 bool FValIsFalse = true;
43151 if (FVal && FVal->getZExtValue() != 0) {
43152 if (FVal->getZExtValue() != 1)
43153 return SDValue();
43154 // If FVal is 1, opposite cond is needed.
43155 needOppositeCond = !needOppositeCond;
43156 FValIsFalse = false;
43157 }
43158 // Quit if TVal is not the constant opposite of FVal.
43159 if (FValIsFalse && TVal->getZExtValue() != 1)
43160 return SDValue();
43161 if (!FValIsFalse && TVal->getZExtValue() != 0)
43162 return SDValue();
43163 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
43164 if (needOppositeCond)
43165 CC = X86::GetOppositeBranchCondition(CC);
43166 return SetCC.getOperand(3);
43167 }
43168 }
43169
43170 return SDValue();
43171}
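// [Illustrative sketch, not part of the original file.] The scalar identity
// behind the CMOV case above: a CMOV selecting between the canonical booleans
// 0 and 1 just reproduces its condition (or its negation when the constants
// are swapped), so the flags it reads can be used directly.
static bool cmovOfCanonicalBoolsSketch(bool Cond, unsigned FVal, unsigned TVal) {
  unsigned Sel = Cond ? TVal : FVal;
  // With (FVal, TVal) == (0, 1) this is Cond; with (1, 0) it is !Cond.
  return Sel != 0;
}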
43172
43173/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
43174/// Match:
43175/// (X86or (X86setcc) (X86setcc))
43176/// (X86cmp (and (X86setcc) (X86setcc)), 0)
43177static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
43178 X86::CondCode &CC1, SDValue &Flags,
43179 bool &isAnd) {
43180 if (Cond->getOpcode() == X86ISD::CMP) {
43181 if (!isNullConstant(Cond->getOperand(1)))
43182 return false;
43183
43184 Cond = Cond->getOperand(0);
43185 }
43186
43187 isAnd = false;
43188
43189 SDValue SetCC0, SetCC1;
43190 switch (Cond->getOpcode()) {
43191 default: return false;
43192 case ISD::AND:
43193 case X86ISD::AND:
43194 isAnd = true;
43195 LLVM_FALLTHROUGH;
43196 case ISD::OR:
43197 case X86ISD::OR:
43198 SetCC0 = Cond->getOperand(0);
43199 SetCC1 = Cond->getOperand(1);
43200 break;
43201 };
43202
43203 // Make sure we have SETCC nodes, using the same flags value.
43204 if (SetCC0.getOpcode() != X86ISD::SETCC ||
43205 SetCC1.getOpcode() != X86ISD::SETCC ||
43206 SetCC0->getOperand(1) != SetCC1->getOperand(1))
43207 return false;
43208
43209 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
43210 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
43211 Flags = SetCC0->getOperand(1);
43212 return true;
43213}
43214
43215 // When legalizing carry, we create carries via add X, -1.
43216// If that comes from an actual carry, via setcc, we use the
43217// carry directly.
43218static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
43219 if (EFLAGS.getOpcode() == X86ISD::ADD) {
43220 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
43221 SDValue Carry = EFLAGS.getOperand(0);
43222 while (Carry.getOpcode() == ISD::TRUNCATE ||
43223 Carry.getOpcode() == ISD::ZERO_EXTEND ||
43224 Carry.getOpcode() == ISD::SIGN_EXTEND ||
43225 Carry.getOpcode() == ISD::ANY_EXTEND ||
43226 (Carry.getOpcode() == ISD::AND &&
43227 isOneConstant(Carry.getOperand(1))))
43228 Carry = Carry.getOperand(0);
43229 if (Carry.getOpcode() == X86ISD::SETCC ||
43230 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
43231 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
43232 uint64_t CarryCC = Carry.getConstantOperandVal(0);
43233 SDValue CarryOp1 = Carry.getOperand(1);
43234 if (CarryCC == X86::COND_B)
43235 return CarryOp1;
43236 if (CarryCC == X86::COND_A) {
43237 // Try to convert COND_A into COND_B in an attempt to facilitate
43238 // materializing "setb reg".
43239 //
43240 // Do not flip "e > c", where "c" is a constant, because Cmp
43241 // instruction cannot take an immediate as its first operand.
43242 //
43243 if (CarryOp1.getOpcode() == X86ISD::SUB &&
43244 CarryOp1.getNode()->hasOneUse() &&
43245 CarryOp1.getValueType().isInteger() &&
43246 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
43247 SDValue SubCommute =
43248 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
43249 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
43250 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
43251 }
43252 }
43253 // If this is a check of the z flag of an add with 1, switch to the
43254 // C flag.
43255 if (CarryCC == X86::COND_E &&
43256 CarryOp1.getOpcode() == X86ISD::ADD &&
43257 isOneConstant(CarryOp1.getOperand(1)))
43258 return CarryOp1;
43259 }
43260 }
43261 }
43262
43263 return SDValue();
43264}
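// [Illustrative sketch, not part of the original file.] Why "add X, -1" can
// model a carry: the unsigned add X + ~0 carries out exactly when X != 0, so
// when X is a 0/1 setcc result the carry flag simply reproduces X. A 32-bit
// width is assumed; the helper name is made up for the demo.
static bool carryOfAddAllOnesSketch(unsigned X) {
  unsigned long long Wide = (unsigned long long)X + 0xFFFFFFFFull;
  return (Wide >> 32) != 0; // CF of the 32-bit add == (X != 0)
}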
43265
43266 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
43267/// to avoid the inversion.
43268static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
43269 SelectionDAG &DAG,
43270 const X86Subtarget &Subtarget) {
43271 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
43272 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
43273 EFLAGS.getOpcode() != X86ISD::TESTP)
43274 return SDValue();
43275
43276 // PTEST/TESTP sets EFLAGS as:
43277 // TESTZ: ZF = (Op0 & Op1) == 0
43278 // TESTC: CF = (~Op0 & Op1) == 0
43279 // TESTNZC: ZF == 0 && CF == 0
43280 EVT VT = EFLAGS.getValueType();
43281 SDValue Op0 = EFLAGS.getOperand(0);
43282 SDValue Op1 = EFLAGS.getOperand(1);
43283 EVT OpVT = Op0.getValueType();
43284
43285 // TEST*(~X,Y) == TEST*(X,Y)
43286 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
43287 X86::CondCode InvCC;
43288 switch (CC) {
43289 case X86::COND_B:
43290 // testc -> testz.
43291 InvCC = X86::COND_E;
43292 break;
43293 case X86::COND_AE:
43294 // !testc -> !testz.
43295 InvCC = X86::COND_NE;
43296 break;
43297 case X86::COND_E:
43298 // testz -> testc.
43299 InvCC = X86::COND_B;
43300 break;
43301 case X86::COND_NE:
43302 // !testz -> !testc.
43303 InvCC = X86::COND_AE;
43304 break;
43305 case X86::COND_A:
43306 case X86::COND_BE:
43307 // testnzc -> testnzc (no change).
43308 InvCC = CC;
43309 break;
43310 default:
43311 InvCC = X86::COND_INVALID;
43312 break;
43313 }
43314
43315 if (InvCC != X86::COND_INVALID) {
43316 CC = InvCC;
43317 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43318 DAG.getBitcast(OpVT, NotOp0), Op1);
43319 }
43320 }
43321
43322 if (CC == X86::COND_E || CC == X86::COND_NE) {
43323 // TESTZ(X,~Y) == TESTC(Y,X)
43324 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
43325 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
43326 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43327 DAG.getBitcast(OpVT, NotOp1), Op0);
43328 }
43329
43330 if (Op0 == Op1) {
43331 SDValue BC = peekThroughBitcasts(Op0);
43332 EVT BCVT = BC.getValueType();
43333 assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
43334 "Unexpected vector type");
43335
43336 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
43337 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
43338 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43339 DAG.getBitcast(OpVT, BC.getOperand(0)),
43340 DAG.getBitcast(OpVT, BC.getOperand(1)));
43341 }
43342
43343 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
43344 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
43345 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
43346 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
43347 DAG.getBitcast(OpVT, BC.getOperand(0)),
43348 DAG.getBitcast(OpVT, BC.getOperand(1)));
43349 }
43350
43351 // If every element is an all-sign value, see if we can use MOVMSK to
43352 // more efficiently extract the sign bits and compare that.
43353 // TODO: Handle TESTC with comparison inversion.
43354 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
43355 // MOVMSK combines to make sure it's never worse than PTEST?
43356 unsigned EltBits = BCVT.getScalarSizeInBits();
43357 if (DAG.ComputeNumSignBits(BC) == EltBits) {
43358 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
43359 APInt SignMask = APInt::getSignMask(EltBits);
43360 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43361 if (SDValue Res =
43362 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
43363 // For vXi16 cases we need to use pmovmskb and extract every other
43364 // sign bit.
43365 SDLoc DL(EFLAGS);
43366 if (EltBits == 16) {
43367 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
43368 Res = DAG.getBitcast(MovmskVT, Res);
43369 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
43370 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
43371 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
43372 } else {
43373 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
43374 }
43375 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
43376 DAG.getConstant(0, DL, MVT::i32));
43377 }
43378 }
43379 }
43380
43381 // TESTZ(-1,X) == TESTZ(X,X)
43382 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
43383 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
43384
43385 // TESTZ(X,-1) == TESTZ(X,X)
43386 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
43387 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
43388 }
43389
43390 return SDValue();
43391}
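// [Illustrative sketch, not part of the original file.] A scalar model of the
// PTEST/TESTP flag identities used above, with ZF = (A & B) == 0 and
// CF = (~A & B) == 0. Under that model ZF(~X, Y) == CF(X, Y) and
// ZF(X, ~Y) == CF(Y, X), which is why inverting one operand can be folded
// into a condition-code swap. Names are made up for the demo.
struct PTestFlagsSketch { bool ZF, CF; };
static PTestFlagsSketch ptestSketch(unsigned A, unsigned B) {
  return {(A & B) == 0, (~A & B) == 0};
}
// e.g. ptestSketch(~X, Y).ZF == ptestSketch(X, Y).CF for any X, Y.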
43392
43393// Attempt to simplify the MOVMSK input based on the comparison type.
43394static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
43395 SelectionDAG &DAG,
43396 const X86Subtarget &Subtarget) {
43397 // Handle eq/ne against zero (any_of).
43398 // Handle eq/ne against -1 (all_of).
43399 if (!(CC == X86::COND_E || CC == X86::COND_NE))
43400 return SDValue();
43401 if (EFLAGS.getValueType() != MVT::i32)
43402 return SDValue();
43403 unsigned CmpOpcode = EFLAGS.getOpcode();
43404 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
43405 return SDValue();
43406 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
43407 if (!CmpConstant)
43408 return SDValue();
43409 const APInt &CmpVal = CmpConstant->getAPIntValue();
43410
43411 SDValue CmpOp = EFLAGS.getOperand(0);
43412 unsigned CmpBits = CmpOp.getValueSizeInBits();
43413 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
43414
43415 // Peek through any truncate.
43416 if (CmpOp.getOpcode() == ISD::TRUNCATE)
43417 CmpOp = CmpOp.getOperand(0);
43418
43419 // Bail if we don't find a MOVMSK.
43420 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
43421 return SDValue();
43422
43423 SDValue Vec = CmpOp.getOperand(0);
43424 MVT VecVT = Vec.getSimpleValueType();
43425 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
43426 "Unexpected MOVMSK operand");
43427 unsigned NumElts = VecVT.getVectorNumElements();
43428 unsigned NumEltBits = VecVT.getScalarSizeInBits();
43429
43430 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
43431 bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
43432 CmpVal.isMask(NumElts);
43433 if (!IsAnyOf && !IsAllOf)
43434 return SDValue();
43435
43436 // See if we can peek through to a vector with a wider element type, if the
43437 // signbits extend down to all the sub-elements as well.
43438 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
43439 // potential SimplifyDemandedBits/Elts cases.
43440 if (Vec.getOpcode() == ISD::BITCAST) {
43441 SDValue BC = peekThroughBitcasts(Vec);
43442 MVT BCVT = BC.getSimpleValueType();
43443 unsigned BCNumElts = BCVT.getVectorNumElements();
43444 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
43445 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
43446 BCNumEltBits > NumEltBits &&
43447 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
43448 SDLoc DL(EFLAGS);
43449 unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
43450 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
43451 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
43452 DAG.getConstant(CmpMask, DL, MVT::i32));
43453 }
43454 }
43455
43456 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
43457 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
43458 if (IsAllOf && Subtarget.hasSSE41()) {
43459 SDValue BC = peekThroughBitcasts(Vec);
43460 if (BC.getOpcode() == X86ISD::PCMPEQ &&
43461 ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
43462 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
43463 SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
43464 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
43465 }
43466 }
43467
43468 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
43469 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
43470 // sign bits prior to the comparison with zero unless we know that
43471 // the vXi16 splats the sign bit down to the lower i8 half.
43472 // TODO: Handle all_of patterns.
43473 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
43474 SDValue VecOp0 = Vec.getOperand(0);
43475 SDValue VecOp1 = Vec.getOperand(1);
43476 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
43477 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
43478 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
43479 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
43480 SDLoc DL(EFLAGS);
43481 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
43482 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43483 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
43484 if (!SignExt0) {
43485 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
43486 DAG.getConstant(0xAAAA, DL, MVT::i16));
43487 }
43488 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
43489 DAG.getConstant(0, DL, MVT::i16));
43490 }
43491 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
43492 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
43493 if (CmpBits >= 16 && Subtarget.hasInt256() &&
43494 VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43495 VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43496 VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
43497 VecOp0.getConstantOperandAPInt(1) == 0 &&
43498 VecOp1.getConstantOperandAPInt(1) == 8 &&
43499 (IsAnyOf || (SignExt0 && SignExt1))) {
43500 SDLoc DL(EFLAGS);
43501 SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
43502 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43503 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
43504 if (!SignExt0 || !SignExt1) {
43505 assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
43506 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
43507 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
43508 }
43509 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
43510 DAG.getConstant(CmpMask, DL, MVT::i32));
43511 }
43512 }
43513
43514 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
43515 SmallVector<int, 32> ShuffleMask;
43516 SmallVector<SDValue, 2> ShuffleInputs;
43517 if (NumElts <= CmpBits &&
43518 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
43519 ShuffleMask, DAG) &&
43520 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
43521 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
43522 unsigned NumShuffleElts = ShuffleMask.size();
43523 APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
43524 for (int M : ShuffleMask) {
43525 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
43526 DemandedElts.setBit(M);
43527 }
43528 if (DemandedElts.isAllOnesValue()) {
43529 SDLoc DL(EFLAGS);
43530 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
43531 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
43532 Result =
43533 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
43534 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
43535 EFLAGS.getOperand(1));
43536 }
43537 }
43538
43539 return SDValue();
43540}
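// [Illustrative sketch, not part of the original file.] A scalar model of the
// MOVMSK patterns matched above: the mask collects one sign bit per element,
// so "mask == 0" answers the any_of question and "mask == (1 << NumElts) - 1"
// answers the all_of question. Element type and count are assumptions.
static unsigned movmskSketch(const int *Elts, unsigned NumElts) {
  unsigned Mask = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    Mask |= (Elts[I] < 0 ? 1u : 0u) << I; // one sign bit per element
  return Mask;
}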
43541
43542/// Optimize an EFLAGS definition used according to the condition code \p CC
43543/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
43544/// uses of chain values.
43545static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
43546 SelectionDAG &DAG,
43547 const X86Subtarget &Subtarget) {
43548 if (CC == X86::COND_B)
43549 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
43550 return Flags;
43551
43552 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
43553 return R;
43554
43555 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
43556 return R;
43557
43558 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
43559 return R;
43560
43561 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
43562}
43563
43564/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
43565static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
43566 TargetLowering::DAGCombinerInfo &DCI,
43567 const X86Subtarget &Subtarget) {
43568 SDLoc DL(N);
43569
43570 SDValue FalseOp = N->getOperand(0);
43571 SDValue TrueOp = N->getOperand(1);
43572 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
43573 SDValue Cond = N->getOperand(3);
43574
43575 // cmov X, X, ?, ? --> X
43576 if (TrueOp == FalseOp)
43577 return TrueOp;
43578
43579 // Try to simplify the EFLAGS and condition code operands.
43580 // We can't always do this as FCMOV only supports a subset of X86 cond.
43581 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
43582 if (!(FalseOp.getValueType() == MVT::f80 ||
43583 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
43584 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
43585 !Subtarget.hasCMov() || hasFPCMov(CC)) {
43586 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
43587 Flags};
43588 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43589 }
43590 }
43591
43592 // If this is a select between two integer constants, try to do some
43593 // optimizations. Note that the operands are ordered the opposite of SELECT
43594 // operands.
43595 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
43596 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
43597 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
43598 // larger than FalseC (the false value).
43599 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
43600 CC = X86::GetOppositeBranchCondition(CC);
43601 std::swap(TrueC, FalseC);
43602 std::swap(TrueOp, FalseOp);
43603 }
43604
43605 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
43606 // This is efficient for any integer data type (including i8/i16) and
43607 // shift amount.
43608 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
43609 Cond = getSETCC(CC, Cond, DL, DAG);
43610
43611 // Zero extend the condition if needed.
43612 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
43613
43614 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
43615 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
43616 DAG.getConstant(ShAmt, DL, MVT::i8));
43617 return Cond;
43618 }
43619
43620 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
43621 // for any integer data type, including i8/i16.
43622 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
43623 Cond = getSETCC(CC, Cond, DL, DAG);
43624
43625 // Zero extend the condition if needed.
43626 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
43627 FalseC->getValueType(0), Cond);
43628 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
43629 SDValue(FalseC, 0));
43630 return Cond;
43631 }
43632
43633 // Optimize cases that will turn into an LEA instruction. This requires
43634 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
43635 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
43636 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
43637 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
43638 "Implicit constant truncation");
43639
43640 bool isFastMultiplier = false;
43641 if (Diff.ult(10)) {
43642 switch (Diff.getZExtValue()) {
43643 default: break;
43644 case 1: // result = add base, cond
43645 case 2: // result = lea base( , cond*2)
43646 case 3: // result = lea base(cond, cond*2)
43647 case 4: // result = lea base( , cond*4)
43648 case 5: // result = lea base(cond, cond*4)
43649 case 8: // result = lea base( , cond*8)
43650 case 9: // result = lea base(cond, cond*8)
43651 isFastMultiplier = true;
43652 break;
43653 }
43654 }
43655
43656 if (isFastMultiplier) {
43657 Cond = getSETCC(CC, Cond, DL ,DAG);
43658 // Zero extend the condition if needed.
43659 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
43660 Cond);
43661 // Scale the condition by the difference.
43662 if (Diff != 1)
43663 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
43664 DAG.getConstant(Diff, DL, Cond.getValueType()));
43665
43666 // Add the base if non-zero.
43667 if (FalseC->getAPIntValue() != 0)
43668 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
43669 SDValue(FalseC, 0));
43670 return Cond;
43671 }
43672 }
43673 }
43674 }
43675
43676 // Handle these cases:
43677 // (select (x != c), e, c) -> (select (x != c), e, x),
43678 // (select (x == c), c, e) -> (select (x == c), x, e)
43679 // where c is an integer constant, and the "select" is the combination
43680 // of CMOV and CMP.
43681 //
43682 // The rationale for this change is that the conditional-move from a constant
43683 // needs two instructions, however, conditional-move from a register needs
43684 // only one instruction.
43685 //
43686 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
43687 // some instruction-combining opportunities. This opt needs to be
43688 // postponed as late as possible.
43689 //
43690 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
43691 // the DCI.xxxx conditions are provided to postpone the optimization as
43692 // late as possible.
43693
43694 ConstantSDNode *CmpAgainst = nullptr;
43695 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
43696 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
43697 !isa<ConstantSDNode>(Cond.getOperand(0))) {
43698
43699 if (CC == X86::COND_NE &&
43700 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
43701 CC = X86::GetOppositeBranchCondition(CC);
43702 std::swap(TrueOp, FalseOp);
43703 }
43704
43705 if (CC == X86::COND_E &&
43706 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
43707 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
43708 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
43709 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43710 }
43711 }
43712 }
43713
43714 // Fold and/or of setcc's to double CMOV:
43715 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
43716 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
43717 //
43718 // This combine lets us generate:
43719 // cmovcc1 (jcc1 if we don't have CMOV)
43720 // cmovcc2 (same)
43721 // instead of:
43722 // setcc1
43723 // setcc2
43724 // and/or
43725 // cmovne (jne if we don't have CMOV)
43726 // When we can't use the CMOV instruction, it might increase branch
43727 // mispredicts.
43728 // When we can use CMOV, or when there is no mispredict, this improves
43729 // throughput and reduces register pressure.
43730 //
43731 if (CC == X86::COND_NE) {
43732 SDValue Flags;
43733 X86::CondCode CC0, CC1;
43734 bool isAndSetCC;
43735 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
43736 if (isAndSetCC) {
43737 std::swap(FalseOp, TrueOp);
43738 CC0 = X86::GetOppositeBranchCondition(CC0);
43739 CC1 = X86::GetOppositeBranchCondition(CC1);
43740 }
43741
43742 SDValue LOps[] = {FalseOp, TrueOp,
43743 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
43744 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
43745 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
43746 Flags};
43747 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
43748 return CMOV;
43749 }
43750 }
43751
43752 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
43753 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
43754 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
43755 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
43756 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
43757 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
43758 SDValue Add = TrueOp;
43759 SDValue Const = FalseOp;
43760 // Canonicalize the condition code for easier matching and output.
43761 if (CC == X86::COND_E)
43762 std::swap(Add, Const);
43763
43764 // We might have replaced the constant in the cmov with the LHS of the
43765 // compare. If so change it to the RHS of the compare.
43766 if (Const == Cond.getOperand(0))
43767 Const = Cond.getOperand(1);
43768
43769 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
43770 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
43771 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
43772 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
43773 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
43774 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
43775 EVT VT = N->getValueType(0);
43776 // This should constant fold.
43777 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
43778 SDValue CMov =
43779 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
43780 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
43781 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
43782 }
43783 }
43784
43785 return SDValue();
43786}
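// [Illustrative sketch, not part of the original file.] The arithmetic behind
// the constant-CMOV folds above: once the condition is materialized as a 0/1
// value, "Cond ? TrueC : FalseC" equals FalseC + Cond * (TrueC - FalseC), and
// for differences of 1, 2, 3, 4, 5, 8 or 9 the scale maps onto a single LEA.
// The helper name and 64-bit width are assumptions for the demo.
static long long cmovAsLeaSketch(bool Cond, long long TrueC, long long FalseC) {
  long long Diff = TrueC - FalseC;       // assumed to be a "fast" multiplier
  return FalseC + (Cond ? 1 : 0) * Diff; // == Cond ? TrueC : FalseC
}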
43787
43788/// Different mul shrinking modes.
43789enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
43790
43791static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
43792 EVT VT = N->getOperand(0).getValueType();
43793 if (VT.getScalarSizeInBits() != 32)
43794 return false;
43795
43796 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
43797 unsigned SignBits[2] = {1, 1};
43798 bool IsPositive[2] = {false, false};
43799 for (unsigned i = 0; i < 2; i++) {
43800 SDValue Opd = N->getOperand(i);
43801
43802 SignBits[i] = DAG.ComputeNumSignBits(Opd);
43803 IsPositive[i] = DAG.SignBitIsZero(Opd);
43804 }
43805
43806 bool AllPositive = IsPositive[0] && IsPositive[1];
43807 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
43808 // When ranges are from -128 ~ 127, use MULS8 mode.
43809 if (MinSignBits >= 25)
43810 Mode = ShrinkMode::MULS8;
43811 // When ranges are from 0 ~ 255, use MULU8 mode.
43812 else if (AllPositive && MinSignBits >= 24)
43813 Mode = ShrinkMode::MULU8;
43814 // When ranges are from -32768 ~ 32767, use MULS16 mode.
43815 else if (MinSignBits >= 17)
43816 Mode = ShrinkMode::MULS16;
43817 // When ranges are from 0 ~ 65535, use MULU16 mode.
43818 else if (AllPositive && MinSignBits >= 16)
43819 Mode = ShrinkMode::MULU16;
43820 else
43821 return false;
43822 return true;
43823}
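// [Illustrative sketch, not part of the original file.] The thresholds above
// follow from what ComputeNumSignBits reports: an i32 with at least
// 32 - 8 + 1 = 25 leading bits equal to the sign bit is the sign-extension of
// an i8, and 32 - 16 + 1 = 17 such bits means an i16. A scalar check of the
// i8 case (the helper name is made up for the demo):
static bool fitsInSignedBytePerSignBitsSketch(int V) {
  unsigned U = (unsigned)V;
  unsigned NumSignBits = 1; // the sign bit itself
  for (int Bit = 30; Bit >= 0 && ((U >> Bit) & 1) == (U >> 31); --Bit)
    ++NumSignBits;
  return NumSignBits >= 25; // equivalent to -128 <= V && V <= 127
}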
43824
43825/// When the operands of vector mul are extended from smaller size values,
43826 /// like i8 and i16, the type of mul may be shrunk to generate more
43827/// efficient code. Two typical patterns are handled:
43828/// Pattern1:
43829/// %2 = sext/zext <N x i8> %1 to <N x i32>
43830/// %4 = sext/zext <N x i8> %3 to <N x i32>
43831 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43832/// %5 = mul <N x i32> %2, %4
43833///
43834/// Pattern2:
43835/// %2 = zext/sext <N x i16> %1 to <N x i32>
43836/// %4 = zext/sext <N x i16> %3 to <N x i32>
43837/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
43838/// %5 = mul <N x i32> %2, %4
43839///
43840/// There are four mul shrinking modes:
43841/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
43842 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
43843/// generate pmullw+sext32 for it (MULS8 mode).
43844/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
43845/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
43846/// generate pmullw+zext32 for it (MULU8 mode).
43847/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
43848/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
43849/// generate pmullw+pmulhw for it (MULS16 mode).
43850/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
43851/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
43852/// generate pmullw+pmulhuw for it (MULU16 mode).
43853static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
43854 const X86Subtarget &Subtarget) {
43855 // Check for legality
43856 // pmullw/pmulhw are not available before SSE2.
43857 if (!Subtarget.hasSSE2())
43858 return SDValue();
43859
43860 // Check for profitability
43861 // pmulld is supported since SSE41. It is better to use pmulld
43862 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
43863 // the expansion.
43864 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
43865 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
43866 return SDValue();
43867
43868 ShrinkMode Mode;
43869 if (!canReduceVMulWidth(N, DAG, Mode))
43870 return SDValue();
43871
43872 SDLoc DL(N);
43873 SDValue N0 = N->getOperand(0);
43874 SDValue N1 = N->getOperand(1);
43875 EVT VT = N->getOperand(0).getValueType();
43876 unsigned NumElts = VT.getVectorNumElements();
43877 if ((NumElts % 2) != 0)
43878 return SDValue();
43879
43880 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
43881
43882 // Shrink the operands of mul.
43883 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
43884 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
43885
43886 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
43887 // lower part is needed.
43888 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
43889 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
43890 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
43891 : ISD::SIGN_EXTEND,
43892 DL, VT, MulLo);
43893
43894 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
43895 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
43896 // the higher part is also needed.
43897 SDValue MulHi =
43898 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
43899 ReducedVT, NewN0, NewN1);
43900
43901 // Repack the lower part and higher part result of mul into a wider
43902 // result.
43903 // Generate shuffle functioning as punpcklwd.
43904 SmallVector<int, 16> ShuffleMask(NumElts);
43905 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43906 ShuffleMask[2 * i] = i;
43907 ShuffleMask[2 * i + 1] = i + NumElts;
43908 }
43909 SDValue ResLo =
43910 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43911 ResLo = DAG.getBitcast(ResVT, ResLo);
43912 // Generate shuffle functioning as punpckhwd.
43913 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
43914 ShuffleMask[2 * i] = i + NumElts / 2;
43915 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
43916 }
43917 SDValue ResHi =
43918 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
43919 ResHi = DAG.getBitcast(ResVT, ResHi);
43920 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
43921}
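// [Illustrative sketch, not part of the original file.] Per-lane model of the
// MULU16 repacking above: pmullw yields the low 16 bits of each product,
// pmulhuw the high 16 bits, and interleaving them (punpcklwd/punpckhwd)
// rebuilds the full 32-bit product. The signed MULS16 case is analogous.
static unsigned rebuildMulU16LaneSketch(unsigned short A, unsigned short B) {
  unsigned Full = (unsigned)A * (unsigned)B;
  unsigned short Lo = (unsigned short)(Full & 0xFFFF); // pmullw lane
  unsigned short Hi = (unsigned short)(Full >> 16);    // pmulhuw lane
  return (unsigned)Lo | ((unsigned)Hi << 16);          // == Full
}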
43922
43923static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
43924 EVT VT, const SDLoc &DL) {
43925
43926 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
43927 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43928 DAG.getConstant(Mult, DL, VT));
43929 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
43930 DAG.getConstant(Shift, DL, MVT::i8));
43931 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43932 N->getOperand(0));
43933 return Result;
43934 };
43935
43936 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
43937 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
43938 DAG.getConstant(Mul1, DL, VT));
43939 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
43940 DAG.getConstant(Mul2, DL, VT));
43941 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
43942 N->getOperand(0));
43943 return Result;
43944 };
43945
43946 switch (MulAmt) {
43947 default:
43948 break;
43949 case 11:
43950 // mul x, 11 => add ((shl (mul x, 5), 1), x)
43951 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
43952 case 21:
43953 // mul x, 21 => add ((shl (mul x, 5), 2), x)
43954 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
43955 case 41:
43956 // mul x, 41 => add ((shl (mul x, 5), 3), x)
43957 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
43958 case 22:
43959 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
43960 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43961 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
43962 case 19:
43963 // mul x, 19 => add ((shl (mul x, 9), 1), x)
43964 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
43965 case 37:
43966 // mul x, 37 => add ((shl (mul x, 9), 2), x)
43967 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
43968 case 73:
43969 // mul x, 73 => add ((shl (mul x, 9), 3), x)
43970 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
43971 case 13:
43972 // mul x, 13 => add ((shl (mul x, 3), 2), x)
43973 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
43974 case 23:
43975 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
43976 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
43977 case 26:
43978 // mul x, 26 => add ((mul (mul x, 5), 5), x)
43979 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
43980 case 28:
43981 // mul x, 28 => add ((mul (mul x, 9), 3), x)
43982 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
43983 case 29:
43984 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
43985 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
43986 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
43987 }
43988
43989 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
43990 // by a single LEA.
43991 // First check if this is a sum of two powers of 2 because that's easy. Then
43992 // count the trailing zeros up to the lowest set bit.
43993 // TODO: We can do this even without LEA at a cost of two shifts and an add.
43994 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
43995 unsigned ScaleShift = countTrailingZeros(MulAmt);
43996 if (ScaleShift >= 1 && ScaleShift < 4) {
43997 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
43998 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
43999 DAG.getConstant(ShiftAmt, DL, MVT::i8));
44000 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44001 DAG.getConstant(ScaleShift, DL, MVT::i8));
44002 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
44003 }
44004 }
44005
44006 return SDValue();
44007}
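// [Illustrative sketch, not part of the original file.] Two of the scalar
// identities the table above relies on, written out for 64-bit wrap-around
// arithmetic; the helper names are made up for the demo.
static unsigned long long mulBy11Sketch(unsigned long long X) {
  return ((X * 5) << 1) + X; // mul x, 11 => add ((shl (mul x, 5), 1), x)
}
static unsigned long long mulBy23Sketch(unsigned long long X) {
  return ((X * 3) << 3) - X; // mul x, 23 => sub ((shl (mul x, 3), 3), x)
}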
44008
44009// If the upper 17 bits of each element are zero then we can use PMADDWD,
44010// which is always at least as quick as PMULLD, except on KNL.
44011static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
44012 const X86Subtarget &Subtarget) {
44013 if (!Subtarget.hasSSE2())
44014 return SDValue();
44015
44016 if (Subtarget.isPMADDWDSlow())
44017 return SDValue();
44018
44019 EVT VT = N->getValueType(0);
44020
44021 // Only support vXi32 vectors.
44022 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
44023 return SDValue();
44024
44025 // Make sure the type is legal or will be widened to a legal type.
44026 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
44027 return SDValue();
44028
44029 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
44030
44031 // Without BWI, we would need to split v32i16.
44032 if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
44033 return SDValue();
44034
44035 SDValue N0 = N->getOperand(0);
44036 SDValue N1 = N->getOperand(1);
44037
44038 // If we are zero extending two steps without SSE4.1, it's better to reduce
44039 // the vmul width instead.
44040 if (!Subtarget.hasSSE41() &&
44041 (N0.getOpcode() == ISD::ZERO_EXTEND &&
44042 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
44043 (N1.getOpcode() == ISD::ZERO_EXTEND &&
44044 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
44045 return SDValue();
44046
44047 APInt Mask17 = APInt::getHighBitsSet(32, 17);
44048 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
44049 !DAG.MaskedValueIsZero(N0, Mask17))
44050 return SDValue();
44051
44052 // Use SplitOpsAndApply to handle AVX splitting.
44053 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44054 ArrayRef<SDValue> Ops) {
44055 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44056 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
44057 };
44058 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
44059 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
44060 PMADDWDBuilder);
44061}
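// [Illustrative sketch, not part of the original file.] Per-lane model of the
// PMADDWD replacement above: with the upper 17 bits of A and B known zero,
// both high halves are 0 and both low halves are non-negative signed i16, so
// the pmaddwd lane AHi*BHi + ALo*BLo collapses to the plain 32-bit product.
static int pmaddwdLaneSketch(unsigned A, unsigned B) { // A, B < (1u << 15)
  short ALo = (short)(A & 0xFFFF), AHi = (short)(A >> 16); // AHi == 0 here
  short BLo = (short)(B & 0xFFFF), BHi = (short)(B >> 16); // BHi == 0 here
  return (int)AHi * (int)BHi + (int)ALo * (int)BLo;        // == (int)(A * B)
}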
44062
44063static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
44064 const X86Subtarget &Subtarget) {
44065 if (!Subtarget.hasSSE2())
44066 return SDValue();
44067
44068 EVT VT = N->getValueType(0);
44069
44070 // Only support vXi64 vectors.
44071 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
44072 VT.getVectorNumElements() < 2 ||
44073 !isPowerOf2_32(VT.getVectorNumElements()))
44074 return SDValue();
44075
44076 SDValue N0 = N->getOperand(0);
44077 SDValue N1 = N->getOperand(1);
44078
44079 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
44080 // 32-bits. We can lower with this if the sign bits stretch that far.
44081 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
44082 DAG.ComputeNumSignBits(N1) > 32) {
44083 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44084 ArrayRef<SDValue> Ops) {
44085 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
44086 };
44087 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
44088 PMULDQBuilder, /*CheckBWI*/false);
44089 }
44090
44091 // If the upper bits are zero we can use a single pmuludq.
44092 APInt Mask = APInt::getHighBitsSet(64, 32);
44093 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
44094 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44095 ArrayRef<SDValue> Ops) {
44096 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
44097 };
44098 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
44099 PMULUDQBuilder, /*CheckBWI*/false);
44100 }
44101
44102 return SDValue();
44103}
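// [Illustrative sketch, not part of the original file.] Per-lane model of the
// PMULUDQ case above: when the upper 32 bits of both 64-bit elements are
// known zero, the full 64-bit product is just the 32x32->64 unsigned multiply
// of the low halves, which is exactly what pmuludq computes.
static unsigned long long pmuludqLaneSketch(unsigned long long A,
                                            unsigned long long B) {
  unsigned ALo = (unsigned)A, BLo = (unsigned)B; // upper halves assumed zero
  return (unsigned long long)ALo * BLo;          // == A * B under that assumption
}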
44104
44105static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
44106 TargetLowering::DAGCombinerInfo &DCI,
44107 const X86Subtarget &Subtarget) {
44108 EVT VT = N->getValueType(0);
44109
44110 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
44111 return V;
44112
44113 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
44114 return V;
44115
44116 if (DCI.isBeforeLegalize() && VT.isVector())
44117 return reduceVMULWidth(N, DAG, Subtarget);
44118
44119 // Optimize a single multiply with constant into two operations in order to
44120 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
44121 if (!MulConstantOptimization)
44122 return SDValue();
44123
44124 // An imul is usually smaller than the alternative sequence.
44125 if (DAG.getMachineFunction().getFunction().hasMinSize())
44126 return SDValue();
44127
44128 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
44129 return SDValue();
44130
44131 if (VT != MVT::i64 && VT != MVT::i32)
44132 return SDValue();
44133
44134 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
44135 if (!C)
44136 return SDValue();
44137 if (isPowerOf2_64(C->getZExtValue()))
44138 return SDValue();
44139
44140 int64_t SignMulAmt = C->getSExtValue();
44141 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
44142 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
44143
44144 SDLoc DL(N);
44145 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
44146 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
44147 DAG.getConstant(AbsMulAmt, DL, VT));
44148 if (SignMulAmt < 0)
44149 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
44150 NewMul);
44151
44152 return NewMul;
44153 }
44154
44155 uint64_t MulAmt1 = 0;
44156 uint64_t MulAmt2 = 0;
44157 if ((AbsMulAmt % 9) == 0) {
44158 MulAmt1 = 9;
44159 MulAmt2 = AbsMulAmt / 9;
44160 } else if ((AbsMulAmt % 5) == 0) {
44161 MulAmt1 = 5;
44162 MulAmt2 = AbsMulAmt / 5;
44163 } else if ((AbsMulAmt % 3) == 0) {
44164 MulAmt1 = 3;
44165 MulAmt2 = AbsMulAmt / 3;
44166 }
44167
44168 SDValue NewMul;
44169 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
44170 if (MulAmt2 &&
44171 (isPowerOf2_64(MulAmt2) ||
44172 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
44173
44174 if (isPowerOf2_64(MulAmt2) &&
44175 !(SignMulAmt >= 0 && N->hasOneUse() &&
44176 N->use_begin()->getOpcode() == ISD::ADD))
44177 // If the second multiplier is pow2, issue it first. We want the multiply by
44178 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
44179 // is an add. Only do this for positive multiply amounts since the
44180 // negate would prevent it from being used as an address mode anyway.
44181 std::swap(MulAmt1, MulAmt2);
44182
44183 if (isPowerOf2_64(MulAmt1))
44184 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44185 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
44186 else
44187 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
44188 DAG.getConstant(MulAmt1, DL, VT));
44189
44190 if (isPowerOf2_64(MulAmt2))
44191 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
44192 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
44193 else
44194 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
44195 DAG.getConstant(MulAmt2, DL, VT));
44196
44197 // Negate the result.
44198 if (SignMulAmt < 0)
44199 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
44200 NewMul);
44201 } else if (!Subtarget.slowLEA())
44202 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
44203
44204 if (!NewMul) {
44205 assert(C->getZExtValue() != 0 &&
44206 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
44207 "Both cases that could cause potential overflows should have "
44208 "already been handled.");
44209 if (isPowerOf2_64(AbsMulAmt - 1)) {
44210 // (mul x, 2^N + 1) => (add (shl x, N), x)
44211 NewMul = DAG.getNode(
44212 ISD::ADD, DL, VT, N->getOperand(0),
44213 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44214 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
44215 MVT::i8)));
44216 // To negate, subtract the number from zero
44217 if (SignMulAmt < 0)
44218 NewMul = DAG.getNode(ISD::SUB, DL, VT,
44219 DAG.getConstant(0, DL, VT), NewMul);
44220 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
44221 // (mul x, 2^N - 1) => (sub (shl x, N), x)
44222 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44223 DAG.getConstant(Log2_64(AbsMulAmt + 1),
44224 DL, MVT::i8));
44225 // To negate, reverse the operands of the subtract.
44226 if (SignMulAmt < 0)
44227 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
44228 else
44229 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
44230 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
44231 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
44232 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44233 DAG.getConstant(Log2_64(AbsMulAmt - 2),
44234 DL, MVT::i8));
44235 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
44236 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
44237 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
44238 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
44239 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
44240 DAG.getConstant(Log2_64(AbsMulAmt + 2),
44241 DL, MVT::i8));
44242 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
44243 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
44244 }
44245 }
44246
44247 return NewMul;
44248}
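// Illustrative sketch, not part of X86ISelLowering.cpp: the scalar identities
// behind the multiply-by-constant decompositions above, using uint64_t so the
// arithmetic wraps the same way the DAG integer nodes do. The helper name is
// made up for this example.
#include <cassert>
#include <cstdint>
static void checkMulByConstantDecompositions(uint64_t x) {
  // (mul x, 2^N + 1) => (add (shl x, N), x):          x * 5  == (x << 2) + x
  assert(x * 5 == (x << 2) + x);
  // (mul x, 2^N - 1) => (sub (shl x, N), x):          x * 7  == (x << 3) - x
  assert(x * 7 == (x << 3) - x);
  // Negative amounts reverse the subtract:            x * -7 == x - (x << 3)
  assert(x * (uint64_t)-7 == x - (x << 3));
  // (mul x, 2^N + 2) => (add (add (shl x, N), x), x): x * 10 == ((x << 3) + x) + x
  assert(x * 10 == ((x << 3) + x) + x);
}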
44249
44250// Try to form a MULHU or MULHS node by looking for
44251// (srl (mul ext, ext), 16)
44252// TODO: This is X86 specific because we want to be able to handle wide types
44253// before type legalization. But we can only do it if the vector will be
44254// legalized via widening/splitting. Type legalization can't handle promotion
44255// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
44256// combiner.
44257static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
44258 const X86Subtarget &Subtarget) {
44259 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
44260 "SRL or SRA node is required here!");
44261 SDLoc DL(N);
44262
44263 // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
44264 // the multiply.
44265 if (!Subtarget.hasSSE41())
44266 return SDValue();
44267
44268 // The operation feeding into the shift must be a multiply.
44269 SDValue ShiftOperand = N->getOperand(0);
44270 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
44271 return SDValue();
44272
44273 // Input type should be at least vXi32.
44274 EVT VT = N->getValueType(0);
44275 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
44276 return SDValue();
44277
44278 // Need a shift by 16.
44279 APInt ShiftAmt;
44280 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
44281 ShiftAmt != 16)
44282 return SDValue();
44283
44284 SDValue LHS = ShiftOperand.getOperand(0);
44285 SDValue RHS = ShiftOperand.getOperand(1);
44286
44287 unsigned ExtOpc = LHS.getOpcode();
44288 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
44289 RHS.getOpcode() != ExtOpc)
44290 return SDValue();
44291
44292 // Peek through the extends.
44293 LHS = LHS.getOperand(0);
44294 RHS = RHS.getOperand(0);
44295
44296 // Ensure the input types match.
44297 EVT MulVT = LHS.getValueType();
44298 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
44299 return SDValue();
44300
44301 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
44302 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
44303
44304 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44305 return DAG.getNode(ExtOpc, DL, VT, Mulh);
44306}
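// Illustrative sketch, not part of X86ISelLowering.cpp: why the shift-by-16 of a
// widened 16x16 multiply is exactly the high half (MULHU/MULHS) per element.
// Assumes arithmetic right shift on signed types, as on all x86 compilers.
#include <cassert>
#include <cstdint>
static void checkMulhIdentity(uint16_t a, uint16_t b) {
  // Unsigned: (zext(a) * zext(b)) >> 16 always fits in 16 bits, so the vXi32
  // result equals zext(MULHU(a, b)).
  uint32_t srl16 = ((uint32_t)a * (uint32_t)b) >> 16;
  assert(srl16 <= 0xFFFFu && srl16 == (uint32_t)(uint16_t)srl16);
  // Signed: (sext(a) * sext(b)) >> 16 fits in i16, so it equals sext(MULHS(a, b)).
  int32_t sra16 = ((int32_t)(int16_t)a * (int32_t)(int16_t)b) >> 16;
  assert(sra16 >= INT16_MIN && sra16 <= INT16_MAX &&
         sra16 == (int32_t)(int16_t)sra16);
}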
44307
44308static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
44309 SDValue N0 = N->getOperand(0);
44310 SDValue N1 = N->getOperand(1);
44311 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
44312 EVT VT = N0.getValueType();
44313
44314 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
44315 // since the result of setcc_c is all zero's or all ones.
44316 if (VT.isInteger() && !VT.isVector() &&
44317 N1C && N0.getOpcode() == ISD::AND &&
44318 N0.getOperand(1).getOpcode() == ISD::Constant) {
44319 SDValue N00 = N0.getOperand(0);
44320 APInt Mask = N0.getConstantOperandAPInt(1);
44321 Mask <<= N1C->getAPIntValue();
44322 bool MaskOK = false;
44323 // We can handle cases concerning bit-widening nodes containing setcc_c if
44324 // we carefully interrogate the mask to make sure the transform is
44325 // semantics preserving.
44326 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
44327 // of the underlying setcc_c operation if the setcc_c was zero extended.
44328 // Consider the following example:
44329 // zext(setcc_c) -> i32 0x0000FFFF
44330 // c1 -> i32 0x0000FFFF
44331 // c2 -> i32 0x00000001
44332 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
44333 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
44334 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
44335 MaskOK = true;
44336 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
44337 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
44338 MaskOK = true;
44339 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
44340 N00.getOpcode() == ISD::ANY_EXTEND) &&
44341 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
44342 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
44343 }
44344 if (MaskOK && Mask != 0) {
44345 SDLoc DL(N);
44346 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
44347 }
44348 }
44349
44350 // Hardware support for vector shifts is sparse, which makes us scalarize the
44351 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
44352 // shl.
44353 // (shl V, 1) -> add V,V
44354 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
44355 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
44356 assert(N0.getValueType().isVector() && "Invalid vector shift type");
44357 // We shift all of the values by one. In many cases we do not have
44358 // hardware support for this operation. This is better expressed as an ADD
44359 // of two values.
44360 if (N1SplatC->isOne())
44361 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
44362 }
44363
44364 return SDValue();
44365}
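// Illustrative sketch, not part of X86ISelLowering.cpp: the zero-extension
// counter-example from the comment above, plus the always-safe (shl V, 1) fold.
#include <cassert>
#include <cstdint>
static void checkShiftLeftFolds(uint32_t v) {
  uint32_t setcc_c = 0x0000FFFFu, c1 = 0x0000FFFFu, c2 = 1;
  assert(((setcc_c & c1) << c2) == 0x0001FFFEu); // original expression
  assert((setcc_c & (c1 << c2)) == 0x0000FFFEu); // folded form differs -> not safe here
  assert((v << 1) == v + v);                     // (shl V, 1) -> add V,V
}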
44366
44367static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
44368 const X86Subtarget &Subtarget) {
44369 SDValue N0 = N->getOperand(0);
44370 SDValue N1 = N->getOperand(1);
44371 EVT VT = N0.getValueType();
44372 unsigned Size = VT.getSizeInBits();
44373
44374 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
44375 return V;
44376
44377 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
44378 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
44379 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
44380 // depending on sign of (SarConst - [56,48,32,24,16])
44381
44382 // sexts in X86 are MOVs. The MOVs have the same code size
44383 // as the above SHIFTs (only a SHIFT by 1 has lower code size).
44384 // However the MOVs have 2 advantages over a SHIFT:
44385 // 1. MOVs can write to a register that differs from source
44386 // 2. MOVs accept memory operands
44387
44388 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
44389 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
44390 N0.getOperand(1).getOpcode() != ISD::Constant)
44391 return SDValue();
44392
44393 SDValue N00 = N0.getOperand(0);
44394 SDValue N01 = N0.getOperand(1);
44395 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
44396 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
44397 EVT CVT = N1.getValueType();
44398
44399 if (SarConst.isNegative())
44400 return SDValue();
44401
44402 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
44403 unsigned ShiftSize = SVT.getSizeInBits();
44404 // Skip types without a corresponding sext/zext and any
44405 // ShlConst that is not one of [56,48,32,24,16].
44406 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
44407 continue;
44408 SDLoc DL(N);
44409 SDValue NN =
44410 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
44411 SarConst = SarConst - (Size - ShiftSize);
44412 if (SarConst == 0)
44413 return NN;
44414 else if (SarConst.isNegative())
44415 return DAG.getNode(ISD::SHL, DL, VT, NN,
44416 DAG.getConstant(-SarConst, DL, CVT));
44417 else
44418 return DAG.getNode(ISD::SRA, DL, VT, NN,
44419 DAG.getConstant(SarConst, DL, CVT));
44420 }
44421 return SDValue();
44422}
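// Illustrative sketch, not part of X86ISelLowering.cpp: scalar instances of the
// (ashr (shl a, 56), SarConst) rewrite for the i8 case (ShlConst == 56), assuming
// two's-complement wraparound and arithmetic >> on int64_t, as on x86.
#include <cassert>
#include <cstdint>
static void checkAshrOfShlFolds(int64_t a) {
  int64_t Sext8 = (int64_t)(int8_t)a;              // sign_extend_inreg to i8
  int64_t Shl56 = (int64_t)((uint64_t)a << 56);    // shl computed modulo 2^64
  assert((Shl56 >> 58) == (Sext8 >> 2));           // SarConst=58 -> sra(sext, 2)
  assert((Shl56 >> 53) == (int64_t)((uint64_t)Sext8 << 3)); // SarConst=53 -> shl(sext, 3)
  assert((Shl56 >> 56) == Sext8);                  // SarConst=56 -> the sext itself
}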
44423
44424static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
44425 TargetLowering::DAGCombinerInfo &DCI,
44426 const X86Subtarget &Subtarget) {
44427 SDValue N0 = N->getOperand(0);
44428 SDValue N1 = N->getOperand(1);
44429 EVT VT = N0.getValueType();
44430
44431 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
44432 return V;
44433
44434 // Only do this on the last DAG combine as it can interfere with other
44435 // combines.
44436 if (!DCI.isAfterLegalizeDAG())
44437 return SDValue();
44438
44439 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
44440 // TODO: This is a generic DAG combine that became an x86-only combine to
44441 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
44442 // and-not ('andn').
44443 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
44444 return SDValue();
44445
44446 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
44447 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
44448 if (!ShiftC || !AndC)
44449 return SDValue();
44450
44451 // If we can shrink the constant mask below 8-bits or 32-bits, then this
44452 // transform should reduce code size. It may also enable secondary transforms
44453 // from improved known-bits analysis or instruction selection.
44454 APInt MaskVal = AndC->getAPIntValue();
44455
44456 // If this can be matched by a zero extend, don't optimize.
44457 if (MaskVal.isMask()) {
44458 unsigned TO = MaskVal.countTrailingOnes();
44459 if (TO >= 8 && isPowerOf2_32(TO))
44460 return SDValue();
44461 }
44462
44463 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
44464 unsigned OldMaskSize = MaskVal.getMinSignedBits();
44465 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
44466 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
44467 (OldMaskSize > 32 && NewMaskSize <= 32)) {
44468 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
44469 SDLoc DL(N);
44470 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
44471 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
44472 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
44473 }
44474 return SDValue();
44475}
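// Illustrative sketch, not part of X86ISelLowering.cpp: the mask-shrinking
// rewrite on scalars; shifting first lets the AND use an 8-bit immediate.
#include <cassert>
#include <cstdint>
static void checkSrlOfAndFold(uint32_t x) {
  // srl (and X, 0xFF0), 4 --> and (srl X, 4), 0xFF
  assert(((x & 0xFF0u) >> 4) == ((x >> 4) & 0xFFu));
}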
44476
44477static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
44478 const X86Subtarget &Subtarget) {
44479 unsigned Opcode = N->getOpcode();
44480 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
44481
44482 SDLoc DL(N);
44483 EVT VT = N->getValueType(0);
44484 SDValue N0 = N->getOperand(0);
44485 SDValue N1 = N->getOperand(1);
44486 EVT SrcVT = N0.getValueType();
44487
44488 SDValue BC0 =
44489 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
44490 SDValue BC1 =
44491 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
44492
44493 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
44494 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
44495 // truncation trees that help us avoid lane crossing shuffles.
44496 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
44497 // TODO: We don't handle vXf64 shuffles yet.
44498 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
44499 BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44500 BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44501 BC0.getOperand(0) == BC1.getOperand(0) &&
44502 BC0.getOperand(0).getValueType().is256BitVector() &&
44503 BC0.getConstantOperandAPInt(1) == 0 &&
44504 BC1.getConstantOperandAPInt(1) ==
44505 BC0.getValueType().getVectorNumElements()) {
44506 SmallVector<SDValue> ShuffleOps;
44507 SmallVector<int> ShuffleMask, ScaledMask;
44508 SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
44509 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
44510 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
44511 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
44512 // shuffle to a v4X64 width - we can probably relax this in the future.
44513 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
44514 ShuffleOps[0].getValueType().is256BitVector() &&
44515 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
44516 SDValue Lo, Hi;
44517 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44518 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
44519 Lo = DAG.getBitcast(SrcVT, Lo);
44520 Hi = DAG.getBitcast(SrcVT, Hi);
44521 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
44522 Res = DAG.getBitcast(ShufVT, Res);
44523 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
44524 return DAG.getBitcast(VT, Res);
44525 }
44526 }
44527 }
44528
44529 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
44530 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
44531 // If either/both ops are a shuffle that can scale to v2x64,
44532 // then see if we can perform this as a v4x32 post shuffle.
44533 SmallVector<SDValue> Ops0, Ops1;
44534 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
44535 bool IsShuf0 =
44536 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
44537 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
44538 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
44539 bool IsShuf1 =
44540 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
44541 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
44542 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
44543 if (IsShuf0 || IsShuf1) {
44544 if (!IsShuf0) {
44545 Ops0.assign({BC0});
44546 ScaledMask0.assign({0, 1});
44547 }
44548 if (!IsShuf1) {
44549 Ops1.assign({BC1});
44550 ScaledMask1.assign({0, 1});
44551 }
44552
44553 SDValue LHS, RHS;
44554 int PostShuffle[4] = {-1, -1, -1, -1};
44555 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
44556 if (M < 0)
44557 return true;
44558 Idx = M % 2;
44559 SDValue Src = Ops[M / 2];
44560 if (!LHS || LHS == Src) {
44561 LHS = Src;
44562 return true;
44563 }
44564 if (!RHS || RHS == Src) {
44565 Idx += 2;
44566 RHS = Src;
44567 return true;
44568 }
44569 return false;
44570 };
44571 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
44572 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
44573 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
44574 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
44575 LHS = DAG.getBitcast(SrcVT, LHS);
44576 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
44577 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44578 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
44579 Res = DAG.getBitcast(ShufVT, Res);
44580 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
44581 return DAG.getBitcast(VT, Res);
44582 }
44583 }
44584 }
44585
44586 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
44587 if (VT.is256BitVector() && Subtarget.hasInt256()) {
44588 SmallVector<int> Mask0, Mask1;
44589 SmallVector<SDValue> Ops0, Ops1;
44590 SmallVector<int, 2> ScaledMask0, ScaledMask1;
44591 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
44592 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
44593 !Ops0.empty() && !Ops1.empty() &&
44594 all_of(Ops0,
44595 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
44596 all_of(Ops1,
44597 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
44598 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
44599 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
44600 SDValue Op00 = peekThroughBitcasts(Ops0.front());
44601 SDValue Op10 = peekThroughBitcasts(Ops1.front());
44602 SDValue Op01 = peekThroughBitcasts(Ops0.back());
44603 SDValue Op11 = peekThroughBitcasts(Ops1.back());
44604 if ((Op00 == Op11) && (Op01 == Op10)) {
44605 std::swap(Op10, Op11);
44606 ShuffleVectorSDNode::commuteMask(ScaledMask1);
44607 }
44608 if ((Op00 == Op10) && (Op01 == Op11)) {
44609 const int Map[4] = {0, 2, 1, 3};
44610 SmallVector<int, 4> ShuffleMask(
44611 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
44612 Map[ScaledMask1[1]]});
44613 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
44614 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
44615 DAG.getBitcast(SrcVT, Op01));
44616 Res = DAG.getBitcast(ShufVT, Res);
44617 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
44618 return DAG.getBitcast(VT, Res);
44619 }
44620 }
44621 }
44622
44623 return SDValue();
44624}
44625
44626static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
44627 TargetLowering::DAGCombinerInfo &DCI,
44628 const X86Subtarget &Subtarget) {
44629 unsigned Opcode = N->getOpcode();
44630 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
44631 "Unexpected pack opcode");
44632
44633 EVT VT = N->getValueType(0);
44634 SDValue N0 = N->getOperand(0);
44635 SDValue N1 = N->getOperand(1);
44636 unsigned NumDstElts = VT.getVectorNumElements();
44637 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
44638 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
44639 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
44640 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
44641 "Unexpected PACKSS/PACKUS input type");
44642
44643 bool IsSigned = (X86ISD::PACKSS == Opcode);
44644
44645 // Constant Folding.
44646 APInt UndefElts0, UndefElts1;
44647 SmallVector<APInt, 32> EltBits0, EltBits1;
44648 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
44649 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
44650 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
44651 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
44652 unsigned NumLanes = VT.getSizeInBits() / 128;
44653 unsigned NumSrcElts = NumDstElts / 2;
44654 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
44655 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
44656
44657 APInt Undefs(NumDstElts, 0);
44658 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
44659 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
44660 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
44661 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
44662 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
44663 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
44664
44665 if (UndefElts[SrcIdx]) {
44666 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
44667 continue;
44668 }
44669
44670 APInt &Val = EltBits[SrcIdx];
44671 if (IsSigned) {
44672 // PACKSS: Truncate signed value with signed saturation.
44673 // Source values less than dst minint are saturated to minint.
44674 // Source values greater than dst maxint are saturated to maxint.
44675 if (Val.isSignedIntN(DstBitsPerElt))
44676 Val = Val.trunc(DstBitsPerElt);
44677 else if (Val.isNegative())
44678 Val = APInt::getSignedMinValue(DstBitsPerElt);
44679 else
44680 Val = APInt::getSignedMaxValue(DstBitsPerElt);
44681 } else {
44682 // PACKUS: Truncate signed value with unsigned saturation.
44683 // Source values less than zero are saturated to zero.
44684 // Source values greater than dst maxuint are saturated to maxuint.
44685 if (Val.isIntN(DstBitsPerElt))
44686 Val = Val.trunc(DstBitsPerElt);
44687 else if (Val.isNegative())
44688 Val = APInt::getNullValue(DstBitsPerElt);
44689 else
44690 Val = APInt::getAllOnesValue(DstBitsPerElt);
44691 }
44692 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
44693 }
44694 }
44695
44696 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
44697 }
44698
44699 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
44700 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44701 return V;
44702
44703 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
44704 // truncate to create a larger truncate.
44705 if (Subtarget.hasAVX512() &&
44706 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
44707 N0.getOperand(0).getValueType() == MVT::v8i32) {
44708 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
44709 (!IsSigned &&
44710 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
44711 if (Subtarget.hasVLX())
44712 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
44713
44714 // Widen input to v16i32 so we can truncate that.
44715 SDLoc dl(N);
44716 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
44717 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
44718 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
44719 }
44720 }
44721
44722 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
44723 if (VT.is128BitVector()) {
44724 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
44725 SDValue Src0, Src1;
44726 if (N0.getOpcode() == ExtOpc &&
44727 N0.getOperand(0).getValueType().is64BitVector() &&
44728 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44729 Src0 = N0.getOperand(0);
44730 }
44731 if (N1.getOpcode() == ExtOpc &&
44732 N1.getOperand(0).getValueType().is64BitVector() &&
44733 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
44734 Src1 = N1.getOperand(0);
44735 }
44736 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
44737 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
44738 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
44739 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
44740 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
44741 }
44742 }
44743
44744 // Attempt to combine as shuffle.
44745 SDValue Op(N, 0);
44746 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44747 return Res;
44748
44749 return SDValue();
44750}
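// Illustrative sketch, not part of X86ISelLowering.cpp: per-element saturation
// model matching the PACKSS/PACKUS constant folding above (i16 -> i8 case).
#include <cassert>
#include <cstdint>
static int8_t packssElt(int16_t v) {   // signed saturation
  return v < -128 ? (int8_t)-128 : v > 127 ? (int8_t)127 : (int8_t)v;
}
static uint8_t packusElt(int16_t v) {  // unsigned saturation of a signed source
  return v < 0 ? (uint8_t)0 : v > 255 ? (uint8_t)255 : (uint8_t)v;
}
static void checkPackSaturation() {
  assert(packssElt(1000) == 127 && packssElt(-1000) == -128 && packssElt(-5) == -5);
  assert(packusElt(1000) == 255 && packusElt(-1000) == 0 && packusElt(5) == 5);
}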
44751
44752static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
44753 TargetLowering::DAGCombinerInfo &DCI,
44754 const X86Subtarget &Subtarget) {
44755 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
44756 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
44757 "Unexpected horizontal add/sub opcode");
44758
44759 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
44760 // For slow-hop targets, if we have a hop with a single op, see if there is
44761 // already another hop user whose result we can reuse and shuffle.
44762 MVT VT = N->getSimpleValueType(0);
44763 SDValue LHS = N->getOperand(0);
44764 SDValue RHS = N->getOperand(1);
44765 if (VT.is128BitVector() && LHS == RHS) {
44766 for (SDNode *User : LHS->uses()) {
44767 if (User != N && User->getOpcode() == N->getOpcode()) {
44768 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
44769 if (User->getOperand(0) == LHS && !User->getOperand(1).isUndef()) {
44770 return DAG.getBitcast(
44771 VT,
44772 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44773 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44774 DAG.getUNDEF(ShufVT), {0, 1, 0, 1}));
44775 }
44776 if (User->getOperand(1) == LHS && !User->getOperand(0).isUndef()) {
44777 return DAG.getBitcast(
44778 VT,
44779 DAG.getVectorShuffle(ShufVT, SDLoc(N),
44780 DAG.getBitcast(ShufVT, SDValue(User, 0)),
44781 DAG.getUNDEF(ShufVT), {2, 3, 2, 3}));
44782 }
44783 }
44784 }
44785 }
44786
44787 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
44788 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
44789 LHS.getOpcode() == RHS.getOpcode() &&
44790 LHS.getValueType() == RHS.getValueType()) {
44791 SDValue LHS0 = LHS.getOperand(0);
44792 SDValue RHS0 = LHS.getOperand(1);
44793 SDValue LHS1 = RHS.getOperand(0);
44794 SDValue RHS1 = RHS.getOperand(1);
44795 if ((LHS0 == RHS0 || LHS0.isUndef() || RHS0.isUndef()) &&
44796 (LHS1 == RHS1 || LHS1.isUndef() || RHS1.isUndef())) {
44797 SDLoc DL(N);
44798 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
44799 LHS0.isUndef() ? RHS0 : LHS0,
44800 LHS1.isUndef() ? RHS1 : LHS1);
44801 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
44802 Res = DAG.getBitcast(ShufVT, Res);
44803 SDValue NewLHS =
44804 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44805 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
44806 SDValue NewRHS =
44807 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
44808 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
44809 DAG.ReplaceAllUsesOfValueWith(LHS, DAG.getBitcast(VT, NewLHS));
44810 DAG.ReplaceAllUsesOfValueWith(RHS, DAG.getBitcast(VT, NewRHS));
44811 return SDValue(N, 0);
44812 }
44813 }
44814 }
44815
44816 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
44817 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
44818 return V;
44819
44820 return SDValue();
44821}
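// Illustrative sketch, not part of X86ISelLowering.cpp: a scalar v4i32 HADD
// model showing why a single-op hop can be recovered from an existing two-op
// hop user with a {0,1,0,1} shuffle, as done above for slow-hop targets.
#include <array>
#include <cassert>
using V4 = std::array<int, 4>;
static V4 hadd(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
static void checkHaddReuse(const V4 &X, const V4 &Y) {
  V4 XY = hadd(X, Y);
  V4 Shuf = {XY[0], XY[1], XY[0], XY[1]};  // shuffle {0, 1, 0, 1}
  assert(Shuf == hadd(X, X));              // HADD(X,X) recovered from HADD(X,Y)
}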
44822
44823static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
44824 TargetLowering::DAGCombinerInfo &DCI,
44825 const X86Subtarget &Subtarget) {
44826 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
44827 X86ISD::VSRL == N->getOpcode()) &&
44828 "Unexpected shift opcode");
44829 EVT VT = N->getValueType(0);
44830 SDValue N0 = N->getOperand(0);
44831 SDValue N1 = N->getOperand(1);
44832
44833 // Shift zero -> zero.
44834 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44835 return DAG.getConstant(0, SDLoc(N), VT);
44836
44837 // Detect constant shift amounts.
44838 APInt UndefElts;
44839 SmallVector<APInt, 32> EltBits;
44840 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
44841 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
44842 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
44843 EltBits[0].getZExtValue(), DAG);
44844 }
44845
44846 APInt KnownUndef, KnownZero;
44847 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44848 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
44849 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
44850 KnownZero, DCI))
44851 return SDValue(N, 0);
44852
44853 return SDValue();
44854}
44855
44856static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
44857 TargetLowering::DAGCombinerInfo &DCI,
44858 const X86Subtarget &Subtarget) {
44859 unsigned Opcode = N->getOpcode();
44860 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
44861 X86ISD::VSRLI == Opcode) &&
44862 "Unexpected shift opcode");
44863 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
44864 EVT VT = N->getValueType(0);
44865 SDValue N0 = N->getOperand(0);
44866 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44867 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
44868 "Unexpected value type");
44869 assert(N->getOperand(1).getValueType() == MVT::i8 &&
44870 "Unexpected shift amount type");
44871
44872 // (shift undef, X) -> 0
44873 if (N0.isUndef())
44874 return DAG.getConstant(0, SDLoc(N), VT);
44875
44876 // Out of range logical bit shifts are guaranteed to be zero.
44877 // Out of range arithmetic bit shifts splat the sign bit.
44878 unsigned ShiftVal = N->getConstantOperandVal(1);
44879 if (ShiftVal >= NumBitsPerElt) {
44880 if (LogicalShift)
44881 return DAG.getConstant(0, SDLoc(N), VT);
44882 ShiftVal = NumBitsPerElt - 1;
44883 }
44884
44885 // (shift X, 0) -> X
44886 if (!ShiftVal)
44887 return N0;
44888
44889 // (shift 0, C) -> 0
44890 if (ISD::isBuildVectorAllZeros(N0.getNode()))
44891 // N0 is all zeros or undef. We guarantee that the bits shifted into the
44892 // result are all zeros, not undef.
44893 return DAG.getConstant(0, SDLoc(N), VT);
44894
44895 // (VSRAI -1, C) -> -1
44896 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
44897 // N0 is all ones or undef. We guarantee that the bits shifted into the
44898 // result are all ones, not undef.
44899 return DAG.getConstant(-1, SDLoc(N), VT);
44900
44901 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
44902 if (Opcode == N0.getOpcode()) {
44903 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
44904 unsigned NewShiftVal = ShiftVal + ShiftVal2;
44905 if (NewShiftVal >= NumBitsPerElt) {
44906 // Out of range logical bit shifts are guaranteed to be zero.
44907 // Out of range arithmetic bit shifts splat the sign bit.
44908 if (LogicalShift)
44909 return DAG.getConstant(0, SDLoc(N), VT);
44910 NewShiftVal = NumBitsPerElt - 1;
44911 }
44912 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
44913 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
44914 }
44915
44916 // We can decode 'whole byte' logical bit shifts as shuffles.
44917 if (LogicalShift && (ShiftVal % 8) == 0) {
44918 SDValue Op(N, 0);
44919 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44920 return Res;
44921 }
44922
44923 // Constant Folding.
44924 APInt UndefElts;
44925 SmallVector<APInt, 32> EltBits;
44926 if (N->isOnlyUserOf(N0.getNode()) &&
44927 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
44928 assert(EltBits.size() == VT.getVectorNumElements() &&
44929 "Unexpected shift value type");
44930 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
44931 // created an undef input due to no input bits being demanded, but user
44932 // still expects 0 in other bits.
44933 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
44934 APInt &Elt = EltBits[i];
44935 if (UndefElts[i])
44936 Elt = 0;
44937 else if (X86ISD::VSHLI == Opcode)
44938 Elt <<= ShiftVal;
44939 else if (X86ISD::VSRAI == Opcode)
44940 Elt.ashrInPlace(ShiftVal);
44941 else
44942 Elt.lshrInPlace(ShiftVal);
44943 }
44944 // Reset undef elements since they were zeroed above.
44945 UndefElts = 0;
44946 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
44947 }
44948
44949 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44950 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44951 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44952 return SDValue(N, 0);
44953
44954 return SDValue();
44955}
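// Illustrative sketch, not part of X86ISelLowering.cpp: the shift-composition
// and out-of-range rules used above, on a single 16-bit lane. Assumes arithmetic
// >> on signed types, as on x86.
#include <cassert>
#include <cstdint>
static void checkShiftImmFolds(uint16_t x) {
  // (shift (shift X, 3), 2) -> (shift X, 5) while the sum stays in range.
  assert(((x >> 3) >> 2) == (x >> 5));
  // Total shift >= 16: a logical shift is guaranteed zero.
  assert((uint16_t)((x >> 8) >> 8) == 0);
  // Total shift >= 16: an arithmetic shift clamps to 15 and splats the sign bit.
  int16_t s = (int16_t)x;
  assert((int16_t)(((int32_t)s >> 8) >> 8) == (int16_t)((int32_t)s >> 15));
}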
44956
44957static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
44958 TargetLowering::DAGCombinerInfo &DCI,
44959 const X86Subtarget &Subtarget) {
44960 EVT VT = N->getValueType(0);
44961 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
44962 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
44963 N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
44964 "Unexpected vector insertion");
44965
44966 if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
44967 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
44968 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44969 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
44970 APInt::getAllOnesValue(NumBitsPerElt), DCI))
44971 return SDValue(N, 0);
44972 }
44973
44974 // Attempt to combine insertion patterns to a shuffle.
44975 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
44976 SDValue Op(N, 0);
44977 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
44978 return Res;
44979 }
44980
44981 return SDValue();
44982}
44983
44984/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
44985/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
44986/// OR -> CMPNEQSS.
44987static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
44988 TargetLowering::DAGCombinerInfo &DCI,
44989 const X86Subtarget &Subtarget) {
44990 unsigned opcode;
44991
44992 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
44993 // we're requiring SSE2 for both.
44994 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
44995 SDValue N0 = N->getOperand(0);
44996 SDValue N1 = N->getOperand(1);
44997 SDValue CMP0 = N0.getOperand(1);
44998 SDValue CMP1 = N1.getOperand(1);
44999 SDLoc DL(N);
45000
45001 // The SETCCs should both refer to the same CMP.
45002 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
45003 return SDValue();
45004
45005 SDValue CMP00 = CMP0->getOperand(0);
45006 SDValue CMP01 = CMP0->getOperand(1);
45007 EVT VT = CMP00.getValueType();
45008
45009 if (VT == MVT::f32 || VT == MVT::f64 ||
45010 (VT == MVT::f16 && Subtarget.hasFP16())) {
45011 bool ExpectingFlags = false;
45012 // Check for any users that want flags:
45013 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
45014 !ExpectingFlags && UI != UE; ++UI)
45015 switch (UI->getOpcode()) {
45016 default:
45017 case ISD::BR_CC:
45018 case ISD::BRCOND:
45019 case ISD::SELECT:
45020 ExpectingFlags = true;
45021 break;
45022 case ISD::CopyToReg:
45023 case ISD::SIGN_EXTEND:
45024 case ISD::ZERO_EXTEND:
45025 case ISD::ANY_EXTEND:
45026 break;
45027 }
45028
45029 if (!ExpectingFlags) {
45030 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
45031 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
45032
45033 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
45034 X86::CondCode tmp = cc0;
45035 cc0 = cc1;
45036 cc1 = tmp;
45037 }
45038
45039 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
45040 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
45041 // FIXME: need symbolic constants for these magic numbers.
45042 // See X86ATTInstPrinter.cpp:printSSECC().
45043 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
45044 if (Subtarget.hasAVX512()) {
45045 SDValue FSetCC =
45046 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
45047 DAG.getTargetConstant(x86cc, DL, MVT::i8));
45048 // Need to fill with zeros to ensure the bitcast will produce zeroes
45049 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
45050 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
45051 DAG.getConstant(0, DL, MVT::v16i1),
45052 FSetCC, DAG.getIntPtrConstant(0, DL));
45053 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
45054 N->getSimpleValueType(0));
45055 }
45056 SDValue OnesOrZeroesF =
45057 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
45058 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
45059
45060 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
45061 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
45062
45063 if (is64BitFP && !Subtarget.is64Bit()) {
45064 // On a 32-bit target, we cannot bitcast the 64-bit float to a
45065 // 64-bit integer, since that's not a legal type. Since
45066 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
45067 // bits, but can do this little dance to extract the lowest 32 bits
45068 // and work with those going forward.
45069 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
45070 OnesOrZeroesF);
45071 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
45072 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
45073 Vector32, DAG.getIntPtrConstant(0, DL));
45074 IntVT = MVT::i32;
45075 }
45076
45077 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
45078 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
45079 DAG.getConstant(1, DL, IntVT));
45080 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
45081 ANDed);
45082 return OneBitOfTruth;
45083 }
45084 }
45085 }
45086 }
45087 return SDValue();
45088}
45089
45090/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
45091static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
45092 assert(N->getOpcode() == ISD::AND);
45093
45094 MVT VT = N->getSimpleValueType(0);
45095 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
45096 return SDValue();
45097
45098 SDValue X, Y;
45099 SDValue N0 = N->getOperand(0);
45100 SDValue N1 = N->getOperand(1);
45101
45102 auto GetNot = [&VT, &DAG](SDValue V) {
45103 // Basic X = NOT(Y) detection.
45104 if (SDValue Not = IsNOT(V, DAG))
45105 return Not;
45106 // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
45107 if (V.getOpcode() == X86ISD::VBROADCAST) {
45108 SDValue Src = V.getOperand(0);
45109 EVT SrcVT = Src.getValueType();
45110 if (!SrcVT.isVector())
45111 return SDValue();
45112 if (SDValue Not = IsNOT(Src, DAG))
45113 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
45114 DAG.getBitcast(SrcVT, Not));
45115 }
45116 return SDValue();
45117 };
45118
45119 if (SDValue Not = GetNot(N0)) {
45120 X = Not;
45121 Y = N1;
45122 } else if (SDValue Not = GetNot(N1)) {
45123 X = Not;
45124 Y = N0;
45125 } else
45126 return SDValue();
45127
45128 X = DAG.getBitcast(VT, X);
45129 Y = DAG.getBitcast(VT, Y);
45130 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
45131}
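// Illustrative sketch, not part of X86ISelLowering.cpp: the scalar identity
// behind the ANDNP fold above; ANDNP inverts its first operand only.
#include <cassert>
#include <cstdint>
static void checkAndNotFold(uint32_t x, uint32_t y) {
  // (and (xor X, -1), Y) -> (andnp X, Y)
  assert(((x ^ ~0u) & y) == (~x & y));
}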
45132
45133// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
45134// logical operations, like in the example below.
45135// or (and (truncate x, truncate y)),
45136// (xor (truncate z, build_vector (constants)))
45137// Given a target type \p VT, we generate
45138// or (and x, y), (xor z, zext(build_vector (constants)))
45139// given x, y and z are of type \p VT. We can do so, if operands are either
45140// truncates from VT types, the second operand is a vector of constants or can
45141// be recursively promoted.
45142static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
45143 unsigned Depth) {
45144 // Limit recursion to avoid excessive compile times.
45145 if (Depth >= SelectionDAG::MaxRecursionDepth)
45146 return SDValue();
45147
45148 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
45149 N->getOpcode() != ISD::OR)
45150 return SDValue();
45151
45152 SDValue N0 = N->getOperand(0);
45153 SDValue N1 = N->getOperand(1);
45154 SDLoc DL(N);
45155
45156 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45157 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
45158 return SDValue();
45159
45160 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
45161 N0 = NN0;
45162 else {
45163 // The Left side has to be a trunc.
45164 if (N0.getOpcode() != ISD::TRUNCATE)
45165 return SDValue();
45166
45167 // The type of the truncated inputs.
45168 if (N0.getOperand(0).getValueType() != VT)
45169 return SDValue();
45170
45171 N0 = N0.getOperand(0);
45172 }
45173
45174 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
45175 N1 = NN1;
45176 else {
45177 // The right side has to be a 'trunc' or a constant vector.
45178 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
45179 N1.getOperand(0).getValueType() == VT;
45180 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
45181 return SDValue();
45182
45183 if (RHSTrunc)
45184 N1 = N1.getOperand(0);
45185 else
45186 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
45187 }
45188
45189 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
45190}
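// Illustrative sketch, not part of X86ISelLowering.cpp: bitwise logic commutes
// with truncation, which is what makes the promotion above legal. Shown here
// for a 32-bit to 16-bit truncate with a zero-extended constant.
#include <cassert>
#include <cstdint>
static void checkPromoteMaskArithmetic(uint32_t x, uint32_t y, uint32_t z) {
  uint16_t opThenTrunc = (uint16_t)((x & y) | (z ^ 0x00FFu));
  uint16_t truncThenOp = (uint16_t)(((uint16_t)x & (uint16_t)y) |
                                    ((uint16_t)z ^ (uint16_t)0x00FFu));
  assert(opThenTrunc == truncThenOp);
}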
45191
45192// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
45193// register. In most cases we actually compare or select YMM-sized registers
45194// and mixing the two types creates horrible code. This method optimizes
45195// some of the transition sequences.
45196// Even with AVX-512 this is still useful for removing casts around logical
45197// operations on vXi1 mask types.
45198static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45199 const X86Subtarget &Subtarget) {
45200 EVT VT = N->getValueType(0);
45201 assert(VT.isVector() && "Expected vector type");
45202
45203 SDLoc DL(N);
45204 assert((N->getOpcode() == ISD::ANY_EXTEND ||
45205 N->getOpcode() == ISD::ZERO_EXTEND ||
45206 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
45207
45208 SDValue Narrow = N->getOperand(0);
45209 EVT NarrowVT = Narrow.getValueType();
45210
45211 // Generate the wide operation.
45212 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
45213 if (!Op)
45214 return SDValue();
45215 switch (N->getOpcode()) {
45216 default: llvm_unreachable("Unexpected opcode")::llvm::llvm_unreachable_internal("Unexpected opcode", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45216)
;
45217 case ISD::ANY_EXTEND:
45218 return Op;
45219 case ISD::ZERO_EXTEND:
45220 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
45221 case ISD::SIGN_EXTEND:
45222 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
45223 Op, DAG.getValueType(NarrowVT));
45224 }
45225}
45226
45227static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
45228 unsigned FPOpcode;
45229 switch (Opcode) {
45230 default: llvm_unreachable("Unexpected input node for FP logic conversion")::llvm::llvm_unreachable_internal("Unexpected input node for FP logic conversion"
, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 45230)
;
45231 case ISD::AND: FPOpcode = X86ISD::FAND; break;
45232 case ISD::OR: FPOpcode = X86ISD::FOR; break;
45233 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
45234 }
45235 return FPOpcode;
45236}
45237
45238/// If both input operands of a logic op are being cast from floating point
45239/// types, try to convert this into a floating point logic node to avoid
45240/// unnecessary moves from SSE to integer registers.
45241static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
45242 const X86Subtarget &Subtarget) {
45243 EVT VT = N->getValueType(0);
45244 SDValue N0 = N->getOperand(0);
45245 SDValue N1 = N->getOperand(1);
45246 SDLoc DL(N);
45247
45248 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
45249 return SDValue();
45250
45251 SDValue N00 = N0.getOperand(0);
45252 SDValue N10 = N1.getOperand(0);
45253 EVT N00Type = N00.getValueType();
45254 EVT N10Type = N10.getValueType();
45255
45256 // Ensure that both types are the same and are legal scalar fp types.
45257 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
45258 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
45259 (Subtarget.hasFP16() && N00Type == MVT::f16)))
45260 return SDValue();
45261
45262 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
45263 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
45264 return DAG.getBitcast(VT, FPLogic);
45265}
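
A rough standalone sketch, not from this file: the kind of scalar C++ that produces the AND-of-bitcast-floats pattern this combine targets might look like the following; bits_of and and_float_bits are invented names.

#include <cstdint>
#include <cstring>

// Reinterpret an f32 as its i32 bit pattern (the ISD::BITCAST inputs above).
static std::uint32_t bits_of(float F) {
  std::uint32_t B;
  std::memcpy(&B, &F, sizeof(B));
  return B;
}

// Integer AND of two bitcast floats; without the combine the values travel
// from SSE to integer registers and back.
float and_float_bits(float A, float B) {
  std::uint32_t R = bits_of(A) & bits_of(B);
  float Out;
  std::memcpy(&Out, &R, sizeof(Out));
  return Out;
}
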
45266
45267// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
45268// to reduce XMM->GPR traffic.
45269static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
45270 unsigned Opc = N->getOpcode();
45271 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
45272 "Unexpected bit opcode");
45273
45274 SDValue N0 = N->getOperand(0);
45275 SDValue N1 = N->getOperand(1);
45276
45277 // Both operands must be single use MOVMSK.
45278 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
45279 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
45280 return SDValue();
45281
45282 SDValue Vec0 = N0.getOperand(0);
45283 SDValue Vec1 = N1.getOperand(0);
45284 EVT VecVT0 = Vec0.getValueType();
45285 EVT VecVT1 = Vec1.getValueType();
45286
45287 // Both MOVMSK operands must be from vectors of the same size and same element
45288 // size, but it's OK for an fp/int difference.
45289 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
45290 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
45291 return SDValue();
45292
45293 SDLoc DL(N);
45294 unsigned VecOpc =
45295 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
45296 SDValue Result =
45297 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
45298 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
45299}
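
A minimal scalar model, with invented names, of why this fold is sound: each MOVMSK bit is just a lane's sign bit, so a bitwise op on the masks equals the mask of the lane-wise bitwise op.

#include <cstdint>

// Collect the sign bit of each of four i32 lanes into a 4-bit mask,
// mirroring what MOVMSK does for a 4 x i32 vector.
static unsigned movmsk4(const std::int32_t V[4]) {
  unsigned M = 0;
  for (int I = 0; I < 4; ++I)
    M |= (static_cast<std::uint32_t>(V[I]) >> 31) << I;
  return M;
}

// The sign bit of (X[i] & Y[i]) is the AND of the two sign bits (likewise
// for OR/XOR), so movmsk4(X) & movmsk4(Y) == movmsk4(X & Y) lane by lane.
unsigned and_of_masks(const std::int32_t X[4], const std::int32_t Y[4]) {
  std::int32_t Z[4];
  for (int I = 0; I < 4; ++I)
    Z[I] = X[I] & Y[I];
  return movmsk4(Z); // equals movmsk4(X) & movmsk4(Y)
}
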
45300
45301/// If this is a zero/all-bits result that is bitwise-anded with a low bits
45302/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
45303/// with a shift-right to eliminate loading the vector constant mask value.
45304static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
45305 const X86Subtarget &Subtarget) {
45306 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
45307 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
45308 EVT VT0 = Op0.getValueType();
45309 EVT VT1 = Op1.getValueType();
45310
45311 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
45312 return SDValue();
45313
45314 APInt SplatVal;
45315 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
45316 !SplatVal.isMask())
45317 return SDValue();
45318
45319 // Don't prevent creation of ANDN.
45320 if (isBitwiseNot(Op0))
45321 return SDValue();
45322
45323 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
45324 return SDValue();
45325
45326 unsigned EltBitWidth = VT0.getScalarSizeInBits();
45327 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
45328 return SDValue();
45329
45330 SDLoc DL(N);
45331 unsigned ShiftVal = SplatVal.countTrailingOnes();
45332 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
45333 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
45334 return DAG.getBitcast(N->getValueType(0), Shift);
45335}
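
A small standalone sketch (invented names) of the scalar identity behind this combine, assuming every element of Op0 is all-zeros or all-ones and 1 <= K <= 31:

#include <cstdint>

// X is assumed to be 0 or 0xFFFFFFFF (an all-sign-bits element).
std::uint32_t and_with_low_mask(std::uint32_t X, unsigned K) {
  return X & ((1u << K) - 1u);
}

// For such X the AND above equals a logical shift right by (32 - K),
// which is what the VSRLI replacement computes.
std::uint32_t srl_instead(std::uint32_t X, unsigned K) {
  return X >> (32 - K);
}
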
45336
45337// Get the index node from the lowered DAG of a GEP IR instruction with one
45338// indexing dimension.
45339static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
45340 if (Ld->isIndexed())
45341 return SDValue();
45342
45343 SDValue Base = Ld->getBasePtr();
45344
45345 if (Base.getOpcode() != ISD::ADD)
45346 return SDValue();
45347
45348 SDValue ShiftedIndex = Base.getOperand(0);
45349
45350 if (ShiftedIndex.getOpcode() != ISD::SHL)
45351 return SDValue();
45352
45353 return ShiftedIndex.getOperand(0);
45354
45355}
45356
45357static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
45358 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
45359 switch (VT.getSizeInBits()) {
45360 default: return false;
45361 case 64: return Subtarget.is64Bit() ? true : false;
45362 case 32: return true;
45363 }
45364 }
45365 return false;
45366}
45367
45368// This function recognizes cases where the X86 bzhi instruction can replace an
45369// 'and-load' sequence.
45370// In the case of loading an integer value from an array of constants defined
45371// as follows:
45372//
45373// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
45374//
45375// and then applying a bitwise AND between the loaded value and another input.
45376// It's equivalent to performing bzhi (zero high bits) on the input, with the
45377// same index as the load.
45378static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
45379 const X86Subtarget &Subtarget) {
45380 MVT VT = Node->getSimpleValueType(0);
45381 SDLoc dl(Node);
45382
45383 // Check if the subtarget has a BZHI instruction for the node's type.
45384 if (!hasBZHI(Subtarget, VT))
45385 return SDValue();
45386
45387 // Try matching the pattern for both operands.
45388 for (unsigned i = 0; i < 2; i++) {
45389 SDValue N = Node->getOperand(i);
45390 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
45391
45392 // Bail out if the operand is not a load instruction.
45393 if (!Ld)
45394 return SDValue();
45395
45396 const Value *MemOp = Ld->getMemOperand()->getValue();
45397
45398 if (!MemOp)
45399 return SDValue();
45400
45401 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
45402 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
45403 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
45404
45405 Constant *Init = GV->getInitializer();
45406 Type *Ty = Init->getType();
45407 if (!isa<ConstantDataArray>(Init) ||
45408 !Ty->getArrayElementType()->isIntegerTy() ||
45409 Ty->getArrayElementType()->getScalarSizeInBits() !=
45410 VT.getSizeInBits() ||
45411 Ty->getArrayNumElements() >
45412 Ty->getArrayElementType()->getScalarSizeInBits())
45413 continue;
45414
45415 // Check if the array's constant elements are suitable for our case.
45416 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
45417 bool ConstantsMatch = true;
45418 for (uint64_t j = 0; j < ArrayElementCount; j++) {
45419 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
45420 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
45421 ConstantsMatch = false;
45422 break;
45423 }
45424 }
45425 if (!ConstantsMatch)
45426 continue;
45427
45428 // Do the transformation (For 32-bit type):
45429 // -> (and (load arr[idx]), inp)
45430 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
45431 // that will be replaced with one bzhi instruction.
45432 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
45433 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
45434
45435 // Get the Node which indexes into the array.
45436 SDValue Index = getIndexFromUnindexedLoad(Ld);
45437 if (!Index)
45438 return SDValue();
45439 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
45440
45441 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
45442 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
45443
45444 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
45445 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
45446
45447 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
45448 }
45449 }
45450 }
45451 }
45452 return SDValue();
45453}
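
A hedged scalar model of the equivalence this combine exploits, with invented names; arr[Idx] is assumed to hold (1 << Idx) - 1 as in the pattern above:

#include <cstdint>

// Loading arr[Idx] == (1u << Idx) - 1 and ANDing it with X...
std::uint32_t and_load(std::uint32_t X, unsigned Idx /* < 32 */) {
  std::uint32_t MaskFromTable = (1u << Idx) - 1u; // arr[Idx]
  return X & MaskFromTable;
}

// ...matches the rewritten form: AND with all-ones shifted right by
// (32 - Idx), i.e. "zero the high bits of X starting at Idx", which is
// exactly what BZHI computes.
std::uint32_t zero_high_bits(std::uint32_t X, unsigned Idx /* < 32 */) {
  std::uint32_t LowMask = Idx ? (~0u >> (32 - Idx)) : 0u;
  return X & LowMask;
}
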
45454
45455// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
45456// where C is a mask containing the same number of bits as the setcc and
45457// where the setcc will freely zero the upper bits of the k-register. We can
45458// replace the undef in the concat with 0s and remove the AND. This mainly
45459// helps with v2i1/v4i1 setcc being cast to scalar.
45460static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
45461 const X86Subtarget &Subtarget) {
45462 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
45463
45464 EVT VT = N->getValueType(0);
45465
45466 // Make sure this is an AND with a constant. We will check the value of the
45467 // constant later.
45468 if (!isa<ConstantSDNode>(N->getOperand(1)))
45469 return SDValue();
45470
45471 // This is implied by the ConstantSDNode.
45472 assert(!VT.isVector() && "Expected scalar VT!");
45473
45474 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
45475 !N->getOperand(0).hasOneUse() ||
45476 !N->getOperand(0).getOperand(0).hasOneUse())
45477 return SDValue();
45478
45479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45480 SDValue Src = N->getOperand(0).getOperand(0);
45481 EVT SrcVT = Src.getValueType();
45482 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
45483 !TLI.isTypeLegal(SrcVT))
45484 return SDValue();
45485
45486 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
45487 return SDValue();
45488
45489 // We only care about the first subvector of the concat; we expect the
45490 // other subvectors to be ignored due to the AND if we make the change.
45491 SDValue SubVec = Src.getOperand(0);
45492 EVT SubVecVT = SubVec.getValueType();
45493
45494 // First subvector should be a setcc with a legal result type. The RHS of the
45495 // AND should be a mask with this many bits.
45496 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
45497 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
45498 return SDValue();
45499
45500 EVT SetccVT = SubVec.getOperand(0).getValueType();
45501 if (!TLI.isTypeLegal(SetccVT) ||
45502 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
45503 return SDValue();
45504
45505 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
45506 return SDValue();
45507
45508 // We passed all the checks. Rebuild the concat_vectors with zeroes
45509 // and cast it back to VT.
45510 SDLoc dl(N);
45511 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
45512 DAG.getConstant(0, dl, SubVecVT));
45513 Ops[0] = SubVec;
45514 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
45515 Ops);
45516 return DAG.getBitcast(VT, Concat);
45517}
45518
45519static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
45520 TargetLowering::DAGCombinerInfo &DCI,
45521 const X86Subtarget &Subtarget) {
45522 EVT VT = N->getValueType(0);
45523
45524 // If this is SSE1 only convert to FAND to avoid scalarization.
45525 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45526 return DAG.getBitcast(
45527 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
45528 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
45529 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
45530 }
45531
45532 // Use a 32-bit and+zext if the upper bits are known zero.
45533 if (VT == MVT::i64 && Subtarget.is64Bit() &&
45534 !isa<ConstantSDNode>(N->getOperand(1))) {
45535 APInt HiMask = APInt::getHighBitsSet(64, 32);
45536 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
45537 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
45538 SDLoc dl(N);
45539 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
45540 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
45541 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
45542 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
45543 }
45544 }
45545
45546 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
45547 // TODO: Support multiple SrcOps.
45548 if (VT == MVT::i1) {
45549 SmallVector<SDValue, 2> SrcOps;
45550 SmallVector<APInt, 2> SrcPartials;
45551 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
45552 SrcOps.size() == 1) {
45553 SDLoc dl(N);
45554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45555 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45556 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45557 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45558 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45559 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45560 if (Mask) {
45561 assert(SrcPartials[0].getBitWidth() == NumElts &&
45562 "Unexpected partial reduction mask");
45563 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45564 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45565 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
45566 }
45567 }
45568 }
45569
45570 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
45571 return V;
45572
45573 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45574 return R;
45575
45576 if (DCI.isBeforeLegalizeOps())
45577 return SDValue();
45578
45579 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45580 return R;
45581
45582 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45583 return FPLogic;
45584
45585 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
45586 return R;
45587
45588 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
45589 return ShiftRight;
45590
45591 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
45592 return R;
45593
45594 // Attempt to recursively combine a bitmask AND with shuffles.
45595 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45596 SDValue Op(N, 0);
45597 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45598 return Res;
45599 }
45600
45601 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
45602 if ((VT.getScalarSizeInBits() % 8) == 0 &&
45603 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45604 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
45605 SDValue BitMask = N->getOperand(1);
45606 SDValue SrcVec = N->getOperand(0).getOperand(0);
45607 EVT SrcVecVT = SrcVec.getValueType();
45608
45609 // Check that the constant bitmask masks whole bytes.
45610 APInt UndefElts;
45611 SmallVector<APInt, 64> EltBits;
45612 if (VT == SrcVecVT.getScalarType() &&
45613 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
45614 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
45615 llvm::all_of(EltBits, [](const APInt &M) {
45616 return M.isNullValue() || M.isAllOnesValue();
45617 })) {
45618 unsigned NumElts = SrcVecVT.getVectorNumElements();
45619 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
45620 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
45621
45622 // Create a root shuffle mask from the byte mask and the extracted index.
45623 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
45624 for (unsigned i = 0; i != Scale; ++i) {
45625 if (UndefElts[i])
45626 continue;
45627 int VecIdx = Scale * Idx + i;
45628 ShuffleMask[VecIdx] =
45629 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
45630 }
45631
45632 if (SDValue Shuffle = combineX86ShufflesRecursively(
45633 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
45634 X86::MaxShuffleCombineDepth,
45635 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
45636 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
45637 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
45638 N->getOperand(0).getOperand(1));
45639 }
45640 }
45641
45642 return SDValue();
45643}
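
The 32-bit and+zext rewrite above rests on a simple scalar fact; a minimal sketch with an invented name, assuming at least one operand has its high 32 bits known zero:

#include <cstdint>

// If (A >> 32) == 0 or (B >> 32) == 0, the high half of A & B is zero, so
// the i64 AND can be done as a 32-bit AND followed by a zero-extend.
std::uint64_t and_via_i32(std::uint64_t A, std::uint64_t B) {
  return static_cast<std::uint64_t>(static_cast<std::uint32_t>(A) &
                                    static_cast<std::uint32_t>(B));
}
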
45644
45645// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
45646static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
45647 const X86Subtarget &Subtarget) {
45648 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45649
45650 MVT VT = N->getSimpleValueType(0);
45651 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
45652 return SDValue();
45653
45654 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
45655 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
45656 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
45657 return SDValue();
45658
45659 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
45660 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
45661 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
45662 Subtarget.hasVLX();
45663 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
45664 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
45665 return SDValue();
45666
45667 // Attempt to extract constant byte masks.
45668 APInt UndefElts0, UndefElts1;
45669 SmallVector<APInt, 32> EltBits0, EltBits1;
45670 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
45671 false, false))
45672 return SDValue();
45673 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
45674 false, false))
45675 return SDValue();
45676
45677 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
45678 // TODO - add UNDEF elts support.
45679 if (UndefElts0[i] || UndefElts1[i])
45680 return SDValue();
45681 if (EltBits0[i] != ~EltBits1[i])
45682 return SDValue();
45683 }
45684
45685 SDLoc DL(N);
45686
45687 if (UseVPTERNLOG) {
45688 // Emit a VPTERNLOG node directly.
45689 SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
45690 SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
45691 SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
45692 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
45693 return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
45694 }
45695
45696 SDValue X = N->getOperand(0);
45697 SDValue Y =
45698 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
45699 DAG.getBitcast(VT, N1.getOperand(0)));
45700 return DAG.getNode(ISD::OR, DL, VT, X, Y);
45701}
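
A hedged reference model (invented name, not the instruction itself) of the 0xCA immediate emitted above: VPTERNLOG's immediate is a 3-input truth table indexed by the (A,B,C) bits, and 0xCA encodes "A ? B : C", which with A = mask reproduces OR(AND(X,C),ANDNP(C,Y)).

#include <cstdint>

// Bitwise 3-input truth-table evaluation: for each bit position, look up
// the immediate at index (a<<2 | b<<1 | c).
std::uint64_t ternlog_model(std::uint64_t A, std::uint64_t B, std::uint64_t C,
                            std::uint8_t Imm) {
  std::uint64_t R = 0;
  for (unsigned Bit = 0; Bit < 64; ++Bit) {
    unsigned Idx = (((A >> Bit) & 1) << 2) | (((B >> Bit) & 1) << 1) |
                   ((C >> Bit) & 1);
    R |= ((static_cast<std::uint64_t>(Imm) >> Idx) & 1) << Bit;
  }
  return R;
}
// ternlog_model(Mask, X, Y, 0xCA) == (X & Mask) | (Y & ~Mask) for all inputs.
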
45702
45703// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
45704static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
45705 if (N->getOpcode() != ISD::OR)
45706 return false;
45707
45708 SDValue N0 = N->getOperand(0);
45709 SDValue N1 = N->getOperand(1);
45710
45711 // Canonicalize AND to LHS.
45712 if (N1.getOpcode() == ISD::AND)
45713 std::swap(N0, N1);
45714
45715 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
45716 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
45717 return false;
45718
45719 Mask = N1.getOperand(0);
45720 X = N1.getOperand(1);
45721
45722 // Check to see if the mask appeared in both the AND and ANDNP.
45723 if (N0.getOperand(0) == Mask)
45724 Y = N0.getOperand(1);
45725 else if (N0.getOperand(1) == Mask)
45726 Y = N0.getOperand(0);
45727 else
45728 return false;
45729
45730 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
45731 // ANDNP combine allows other combines to happen that prevent matching.
45732 return true;
45733}
45734
45735// Try to fold:
45736// (or (and (m, y), (pandn m, x)))
45737// into:
45738// (vselect m, x, y)
45739// As a special case, try to fold:
45740// (or (and (m, (sub 0, x)), (pandn m, x)))
45741// into:
45742// (sub (xor X, M), M)
45743static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
45744 const X86Subtarget &Subtarget) {
45745 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
45746
45747 EVT VT = N->getValueType(0);
45748 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
45749 (VT.is256BitVector() && Subtarget.hasInt256())))
45750 return SDValue();
45751
45752 SDValue X, Y, Mask;
45753 if (!matchLogicBlend(N, X, Y, Mask))
45754 return SDValue();
45755
45756 // Validate that X, Y, and Mask are bitcasts, and see through them.
45757 Mask = peekThroughBitcasts(Mask);
45758 X = peekThroughBitcasts(X);
45759 Y = peekThroughBitcasts(Y);
45760
45761 EVT MaskVT = Mask.getValueType();
45762 unsigned EltBits = MaskVT.getScalarSizeInBits();
45763
45764 // TODO: Attempt to handle floating point cases as well?
45765 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
45766 return SDValue();
45767
45768 SDLoc DL(N);
45769
45770 // Attempt to combine to conditional negate: (sub (xor X, M), M)
45771 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
45772 DAG, Subtarget))
45773 return Res;
45774
45775 // PBLENDVB is only available on SSE 4.1.
45776 if (!Subtarget.hasSSE41())
45777 return SDValue();
45778
45779 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
45780 if (Subtarget.hasVLX())
45781 return SDValue();
45782
45783 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
45784
45785 X = DAG.getBitcast(BlendVT, X);
45786 Y = DAG.getBitcast(BlendVT, Y);
45787 Mask = DAG.getBitcast(BlendVT, Mask);
45788 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
45789 return DAG.getBitcast(VT, Mask);
45790}
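
The conditional-negate special case mentioned above has a compact scalar form; a minimal sketch with an invented name, using unsigned arithmetic and assuming M is all-zeros or all-ones:

#include <cstdint>

// (X ^ M) - M: with M == 0 this is X, with M == 0xFFFFFFFF it is ~X + 1,
// i.e. the two's-complement negation, matching
// (or (and M, (sub 0, X)), (andn M, X)).
std::uint32_t conditional_negate(std::uint32_t X, std::uint32_t M) {
  return (X ^ M) - M;
}
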
45791
45792// Helper function for combineOrCmpEqZeroToCtlzSrl
45793// Transforms:
45794// seteq(cmp x, 0)
45795// into:
45796// srl(ctlz x), log2(bitsize(x))
45797// Input pattern is checked by caller.
45798static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
45799 SelectionDAG &DAG) {
45800 SDValue Cmp = Op.getOperand(1);
45801 EVT VT = Cmp.getOperand(0).getValueType();
45802 unsigned Log2b = Log2_32(VT.getSizeInBits());
45803 SDLoc dl(Op);
45804 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
45805 // The result of the shift is true or false, and on X86, the 32-bit
45806 // encoding of shr and lzcnt is more desirable.
45807 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
45808 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
45809 DAG.getConstant(Log2b, dl, MVT::i8));
45810 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
45811}
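
A hedged scalar model (invented names) of this rewrite for an i32 operand:

#include <cstdint>

// Reference lzcnt for i32: returns 32 for zero, like the LZCNT instruction.
static unsigned lzcnt32(std::uint32_t X) {
  unsigned N = 0;
  for (std::uint32_t Bit = 0x80000000u; Bit != 0 && (X & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

// 32 is the only possible lzcnt result with bit 5 set, so shifting right by
// log2(32) == 5 yields exactly the (X == 0) boolean.
bool is_zero_via_lzcnt(std::uint32_t X) {
  return (lzcnt32(X) >> 5) != 0;
}
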
45812
45813// Try to transform:
45814// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
45815// into:
45816// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
45817// Will also attempt to match more generic cases, eg:
45818// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
45819// Only applies if the target supports the FastLZCNT feature.
45820static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
45821 TargetLowering::DAGCombinerInfo &DCI,
45822 const X86Subtarget &Subtarget) {
45823 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
45824 return SDValue();
45825
45826 auto isORCandidate = [](SDValue N) {
45827 return (N->getOpcode() == ISD::OR && N->hasOneUse());
45828 };
45829
45830 // Check that the zero extend is extending to 32 bits or more. The code generated by
45831 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
45832 // instructions to clear the upper bits.
45833 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
45834 !isORCandidate(N->getOperand(0)))
45835 return SDValue();
45836
45837 // Check the node matches: setcc(eq, cmp 0)
45838 auto isSetCCCandidate = [](SDValue N) {
45839 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
45840 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
45841 N->getOperand(1).getOpcode() == X86ISD::CMP &&
45842 isNullConstant(N->getOperand(1).getOperand(1)) &&
45843 N->getOperand(1).getValueType().bitsGE(MVT::i32);
45844 };
45845
45846 SDNode *OR = N->getOperand(0).getNode();
45847 SDValue LHS = OR->getOperand(0);
45848 SDValue RHS = OR->getOperand(1);
45849
45850 // Save nodes matching or(or, setcc(eq, cmp 0)).
45851 SmallVector<SDNode *, 2> ORNodes;
45852 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
45853 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
45854 ORNodes.push_back(OR);
45855 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
45856 LHS = OR->getOperand(0);
45857 RHS = OR->getOperand(1);
45858 }
45859
45860 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
45861 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
45862 !isORCandidate(SDValue(OR, 0)))
45863 return SDValue();
45864
45865 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
45866 // to
45867 // or(srl(ctlz),srl(ctlz)).
45868 // The dag combiner can then fold it into:
45869 // srl(or(ctlz, ctlz)).
45870 EVT VT = OR->getValueType(0);
45871 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
45872 SDValue Ret, NewRHS;
45873 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
45874 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
45875
45876 if (!Ret)
45877 return SDValue();
45878
45879 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
45880 while (ORNodes.size() > 0) {
45881 OR = ORNodes.pop_back_val();
45882 LHS = OR->getOperand(0);
45883 RHS = OR->getOperand(1);
45884 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
45885 if (RHS->getOpcode() == ISD::OR)
45886 std::swap(LHS, RHS);
45887 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
45888 if (!NewRHS)
45889 return SDValue();
45890 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
45891 }
45892
45893 if (Ret)
45894 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
45895
45896 return Ret;
45897}
45898
45899static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
45900 TargetLowering::DAGCombinerInfo &DCI,
45901 const X86Subtarget &Subtarget) {
45902 SDValue N0 = N->getOperand(0);
45903 SDValue N1 = N->getOperand(1);
45904 EVT VT = N->getValueType(0);
45905
45906 // If this is SSE1 only convert to FOR to avoid scalarization.
45907 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
45908 return DAG.getBitcast(MVT::v4i32,
45909 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
45910 DAG.getBitcast(MVT::v4f32, N0),
45911 DAG.getBitcast(MVT::v4f32, N1)));
45912 }
45913
45914 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
45915 // TODO: Support multiple SrcOps.
45916 if (VT == MVT::i1) {
45917 SmallVector<SDValue, 2> SrcOps;
45918 SmallVector<APInt, 2> SrcPartials;
45919 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
45920 SrcOps.size() == 1) {
45921 SDLoc dl(N);
45922 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45923 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
45924 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
45925 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
45926 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
45927 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
45928 if (Mask) {
45929 assert(SrcPartials[0].getBitWidth() == NumElts &&
45930 "Unexpected partial reduction mask");
45931 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
45932 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
45933 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
45934 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
45935 }
45936 }
45937 }
45938
45939 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
45940 return R;
45941
45942 if (DCI.isBeforeLegalizeOps())
45943 return SDValue();
45944
45945 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
45946 return R;
45947
45948 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
45949 return FPLogic;
45950
45951 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
45952 return R;
45953
45954 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
45955 return R;
45956
45957 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
45958 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
45959 // iff the upper elements of the non-shifted arg are zero.
45960 // KUNPCK requires 16+ bool vector elements.
45961 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
45962 unsigned NumElts = VT.getVectorNumElements();
45963 unsigned HalfElts = NumElts / 2;
45964 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
45965 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
45966 N1.getConstantOperandAPInt(1) == HalfElts &&
45967 DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
45968 SDLoc dl(N);
45969 return DAG.getNode(
45970 ISD::CONCAT_VECTORS, dl, VT,
45971 extractSubVector(N0, 0, DAG, dl, HalfElts),
45972 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
45973 }
45974 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
45975 N0.getConstantOperandAPInt(1) == HalfElts &&
45976 DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
45977 SDLoc dl(N);
45978 return DAG.getNode(
45979 ISD::CONCAT_VECTORS, dl, VT,
45980 extractSubVector(N1, 0, DAG, dl, HalfElts),
45981 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
45982 }
45983 }
45984
45985 // Attempt to recursively combine an OR of shuffles.
45986 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
45987 SDValue Op(N, 0);
45988 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
45989 return Res;
45990 }
45991
45992 return SDValue();
45993}
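
The KSHIFTL/KUNPCK fold above reduces to a simple observation about mask bits; a minimal sketch with an invented name for a 16-bit mask built from two 8-bit halves, assuming the upper byte of the non-shifted operand is zero:

#include <cstdint>

// OR(X, Y << 8) with the high byte of X known zero is just the
// concatenation of the two 8-bit mask halves (what KUNPCKBW produces).
std::uint16_t concat_mask_halves(std::uint8_t X, std::uint8_t Y) {
  return static_cast<std::uint16_t>(X | (Y << 8));
}
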
45994
45995/// Try to turn tests against the signbit in the form of:
45996/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
45997/// into:
45998/// SETGT(X, -1)
45999static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
46000 // This is only worth doing if the output type is i8 or i1.
46001 EVT ResultType = N->getValueType(0);
46002 if (ResultType != MVT::i8 && ResultType != MVT::i1)
46003 return SDValue();
46004
46005 SDValue N0 = N->getOperand(0);
46006 SDValue N1 = N->getOperand(1);
46007
46008 // We should be performing an xor against a truncated shift.
46009 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
46010 return SDValue();
46011
46012 // Make sure we are performing an xor against one.
46013 if (!isOneConstant(N1))
46014 return SDValue();
46015
46016 // SetCC on x86 zero extends so only act on this if it's a logical shift.
46017 SDValue Shift = N0.getOperand(0);
46018 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
46019 return SDValue();
46020
46021 // Make sure we are truncating from one of i16, i32 or i64.
46022 EVT ShiftTy = Shift.getValueType();
46023 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
46024 return SDValue();
46025
46026 // Make sure the shift amount extracts the sign bit.
46027 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
46028 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
46029 return SDValue();
46030
46031 // Create a greater-than comparison against -1.
46032 // N.B. Using SETGE against 0 works but we want a canonical looking
46033 // comparison, and using SETGT matches up with what TranslateX86CC does.
46034 SDLoc DL(N);
46035 SDValue ShiftOp = Shift.getOperand(0);
46036 EVT ShiftOpTy = ShiftOp.getValueType();
46037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46038 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
46039 *DAG.getContext(), ResultType);
46040 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
46041 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
46042 if (SetCCResultType != ResultType)
46043 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
46044 return Cond;
46045}
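
A hedged scalar restatement (invented names) of the identity used here for an i32 source:

#include <cstdint>

// ((unsigned)X >> 31) ^ 1 is 1 exactly when the sign bit of X is clear,
// which is the same predicate as X > -1 (the SETGT form emitted above).
bool nonnegative_via_shift(std::int32_t X) {
  return (((static_cast<std::uint32_t>(X) >> 31) ^ 1u) != 0);
}
bool nonnegative_via_setgt(std::int32_t X) { return X > -1; }
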
46046
46047/// Turn vector tests of the signbit in the form of:
46048/// xor (sra X, elt_size(X)-1), -1
46049/// into:
46050/// pcmpgt X, -1
46051///
46052/// This should be called before type legalization because the pattern may not
46053/// persist after that.
46054static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
46055 const X86Subtarget &Subtarget) {
46056 EVT VT = N->getValueType(0);
46057 if (!VT.isSimple())
46058 return SDValue();
46059
46060 switch (VT.getSimpleVT().SimpleTy) {
46061 default: return SDValue();
46062 case MVT::v16i8:
46063 case MVT::v8i16:
46064 case MVT::v4i32:
46065 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
46066 case MVT::v32i8:
46067 case MVT::v16i16:
46068 case MVT::v8i32:
46069 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
46070 }
46071
46072 // There must be a shift right algebraic before the xor, and the xor must be a
46073 // 'not' operation.
46074 SDValue Shift = N->getOperand(0);
46075 SDValue Ones = N->getOperand(1);
46076 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
46077 !ISD::isBuildVectorAllOnes(Ones.getNode()))
46078 return SDValue();
46079
46080 // The shift should be smearing the sign bit across each vector element.
46081 auto *ShiftAmt =
46082 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
46083 if (!ShiftAmt ||
46084 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
46085 return SDValue();
46086
46087 // Create a greater-than comparison against -1. We don't use the more obvious
46088 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
46089 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
46090}
46091
46092/// Detect patterns of truncation with unsigned saturation:
46093///
46094/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
46095/// Return the source value x to be truncated or SDValue() if the pattern was
46096/// not matched.
46097///
46098/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
46099/// where C1 >= 0 and C2 is unsigned max of destination type.
46100///
46101/// (truncate (smax (smin (x, C2), C1)) to dest_type)
46102/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
46103///
46104/// These two patterns are equivalent to:
46105/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
46106/// So return the smax(x, C1) value to be truncated or SDValue() if the
46107/// pattern was not matched.
46108static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
46109 const SDLoc &DL) {
46110 EVT InVT = In.getValueType();
46111
46112 // Saturation with truncation. We truncate from InVT to VT.
46113 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
46114 "Unexpected types for truncate operation");
46115
46116 // Match min/max and return limit value as a parameter.
46117 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
46118 if (V.getOpcode() == Opcode &&
46119 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
46120 return V.getOperand(0);
46121 return SDValue();
46122 };
46123
46124 APInt C1, C2;
46125 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
46126 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
46127 // the element size of the destination type.
46128 if (C2.isMask(VT.getScalarSizeInBits()))
46129 return UMin;
46130
46131 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
46132 if (MatchMinMax(SMin, ISD::SMAX, C1))
46133 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
46134 return SMin;
46135
46136 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
46137 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
46138 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
46139 C2.uge(C1)) {
46140 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
46141 }
46142
46143 return SDValue();
46144}
46145
46146/// Detect patterns of truncation with signed saturation:
46147/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
46148/// signed_max_of_dest_type)) to dest_type)
46149/// or:
46150/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
46151/// signed_min_of_dest_type)) to dest_type).
46152/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
46153/// Return the source value to be truncated or SDValue() if the pattern was not
46154/// matched.
46155static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
46156 unsigned NumDstBits = VT.getScalarSizeInBits();
46157 unsigned NumSrcBits = In.getScalarValueSizeInBits();
46158 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
46159
46160 auto MatchMinMax = [](SDValue V, unsigned Opcode,
46161 const APInt &Limit) -> SDValue {
46162 APInt C;
46163 if (V.getOpcode() == Opcode &&
46164 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
46165 return V.getOperand(0);
46166 return SDValue();
46167 };
46168
46169 APInt SignedMax, SignedMin;
46170 if (MatchPackUS) {
46171 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
46172 SignedMin = APInt(NumSrcBits, 0);
46173 } else {
46174 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
46175 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
46176 }
46177
46178 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
46179 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
46180 return SMax;
46181
46182 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
46183 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
46184 return SMin;
46185
46186 return SDValue();
46187}
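
The patterns matched by detectUSatPattern and detectSSatPattern correspond to the following scalar clamp-then-truncate idiom; a minimal i32 -> i8 sketch with invented names:

#include <algorithm>
#include <cstdint>

// Signed saturation: smax/smin clamp to [-128, 127] before truncating.
std::int8_t strunc_sat(std::int32_t X) {
  return static_cast<std::int8_t>(
      std::min<std::int32_t>(std::max<std::int32_t>(X, -128), 127));
}

// Unsigned saturation (the MatchPackUS / USat form): clamp to [0, 255].
std::uint8_t utrunc_sat(std::int32_t X) {
  return static_cast<std::uint8_t>(
      std::min<std::int32_t>(std::max<std::int32_t>(X, 0), 255));
}
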
46188
46189static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
46190 SelectionDAG &DAG,
46191 const X86Subtarget &Subtarget) {
46192 if (!Subtarget.hasSSE2() || !VT.isVector())
46193 return SDValue();
46194
46195 EVT SVT = VT.getVectorElementType();
46196 EVT InVT = In.getValueType();
46197 EVT InSVT = InVT.getVectorElementType();
46198
46199 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
46200 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
46201 // and concatenate at the same time. Then we can use a final vpmovuswb to
46202 // clip to 0-255.
46203 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
46204 InVT == MVT::v16i32 && VT == MVT::v16i8) {
46205 if (auto USatVal = detectSSatPattern(In, VT, true)) {
46206 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
46207 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
46208 DL, DAG, Subtarget);
46209 assert(Mid && "Failed to pack!");
46210 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
46211 }
46212 }
46213
46214 // vXi32 truncate instructions are available with AVX512F.
46215 // vXi16 truncate instructions are only available with AVX512BW.
46216 // For 256-bit or smaller vectors, we require VLX.
46217 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
46218 // If the result type is 256 bits or larger and we have disabled 512-bit
46219 // registers, we should go ahead and use the pack instructions if possible.
46220 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
46221 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
46222 (InVT.getSizeInBits() > 128) &&
46223 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
46224 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
46225
46226 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
46227 VT.getSizeInBits() >= 64 &&
46228 (SVT == MVT::i8 || SVT == MVT::i16) &&
46229 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
46230 if (auto USatVal = detectSSatPattern(In, VT, true)) {
46231 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
46232 // Only do this when the result is at least 64 bits or we'd leave
46233 // dangling PACKSSDW nodes.
46234 if (SVT == MVT::i8 && InSVT == MVT::i32) {
46235 EVT MidVT = VT.changeVectorElementType(MVT::i16);
46236 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
46237 DAG, Subtarget);
46238 assert(Mid && "Failed to pack!");
46239 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
46240 Subtarget);
46241 assert(V && "Failed to pack!");
46242 return V;
46243 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
46244 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
46245 Subtarget);
46246 }
46247 if (auto SSatVal = detectSSatPattern(In, VT))
46248 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
46249 Subtarget);
46250 }
46251
46252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46253 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
46254 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
46255 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
46256 unsigned TruncOpc = 0;
46257 SDValue SatVal;
46258 if (auto SSatVal = detectSSatPattern(In, VT)) {
46259 SatVal = SSatVal;
46260 TruncOpc = X86ISD::VTRUNCS;
46261 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
46262 SatVal = USatVal;
46263 TruncOpc = X86ISD::VTRUNCUS;
46264 }
46265 if (SatVal) {
46266 unsigned ResElts = VT.getVectorNumElements();
46267 // If the input type is less than 512 bits and we don't have VLX, we need
46268 // to widen to 512 bits.
46269 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
46270 unsigned NumConcats = 512 / InVT.getSizeInBits();
46271 ResElts *= NumConcats;
46272 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
46273 ConcatOps[0] = SatVal;
46274 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
46275 NumConcats * InVT.getVectorNumElements());
46276 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
46277 }
46278 // Widen the result if it's narrower than 128 bits.
46279 if (ResElts * SVT.getSizeInBits() < 128)
46280 ResElts = 128 / SVT.getSizeInBits();
46281 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
46282 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
46283 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
46284 DAG.getIntPtrConstant(0, DL));
46285 }
46286 }
46287
46288 return SDValue();
46289}
46290
46291/// This function detects the AVG pattern between vectors of unsigned i8/i16,
46292/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
46293/// X86ISD::AVG instruction.
46294static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
46295 const X86Subtarget &Subtarget,
46296 const SDLoc &DL) {
46297 if (!VT.isVector())
46298 return SDValue();
46299 EVT InVT = In.getValueType();
46300 unsigned NumElems = VT.getVectorNumElements();
46301
46302 EVT ScalarVT = VT.getVectorElementType();
46303 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
46304 return SDValue();
46305
46306 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
46307 // than the original input type (i8/i16).
46308 EVT InScalarVT = InVT.getVectorElementType();
46309 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
46310 return SDValue();
46311
46312 if (!Subtarget.hasSSE2())
46313 return SDValue();
46314
46315 // Detect the following pattern:
46316 //
46317 // %1 = zext <N x i8> %a to <N x i32>
46318 // %2 = zext <N x i8> %b to <N x i32>
46319 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
46320 // %4 = add nuw nsw <N x i32> %3, %2
46321 // %5 = lshr <N x i32> %4, <i32 1 x N>
46322 // %6 = trunc <N x i32> %5 to <N x i8>
46323 //
46324 // In AVX512, the last instruction can also be a trunc store.
46325 if (In.getOpcode() != ISD::SRL)
46326 return SDValue();
46327
46328 // A lambda checking that the given SDValue is a constant vector and each element
46329 // is in the range [Min, Max].
46330 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
46331 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
46332 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
46333 });
46334 };
46335
46336 // Check if each element of the vector is right-shifted by one.
46337 SDValue LHS = In.getOperand(0);
46338 SDValue RHS = In.getOperand(1);
46339 if (!IsConstVectorInRange(RHS, 1, 1))
46340 return SDValue();
46341 if (LHS.getOpcode() != ISD::ADD)
46342 return SDValue();
46343
46344 // Detect a pattern of a + b + 1 where the order doesn't matter.
46345 SDValue Operands[3];
46346 Operands[0] = LHS.getOperand(0);
46347 Operands[1] = LHS.getOperand(1);
46348
46349 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46350 ArrayRef<SDValue> Ops) {
46351 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
46352 };
46353
46354 auto AVGSplitter = [&](SDValue Op0, SDValue Op1) {
46355 // Pad to a power-of-2 vector, split+apply and extract the original vector.
46356 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
46357 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
46358 if (NumElemsPow2 != NumElems) {
46359 SmallVector<SDValue, 32> Ops0(NumElemsPow2, DAG.getUNDEF(ScalarVT));
46360 SmallVector<SDValue, 32> Ops1(NumElemsPow2, DAG.getUNDEF(ScalarVT));
46361 for (unsigned i = 0; i != NumElems; ++i) {
46362 SDValue Idx = DAG.getIntPtrConstant(i, DL);
46363 Ops0[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op0, Idx);
46364 Ops1[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op1, Idx);
46365 }
46366 Op0 = DAG.getBuildVector(Pow2VT, DL, Ops0);
46367 Op1 = DAG.getBuildVector(Pow2VT, DL, Ops1);
46368 }
46369 SDValue Res =
46370 SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, {Op0, Op1}, AVGBuilder);
46371 if (NumElemsPow2 == NumElems)
46372 return Res;
46373 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
46374 DAG.getIntPtrConstant(0, DL));
46375 };
46376
46377 // Take care of the case when one of the operands is a constant vector whose
46378 // element is in the range [1, 256].
46379 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
46380 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
46381 Operands[0].getOperand(0).getValueType() == VT) {
46382 // The pattern is detected. Subtract one from the constant vector, then
46383 // demote it and emit the X86ISD::AVG instruction.
46384 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
46385 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
46386 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
46387 return AVGSplitter(Operands[0].getOperand(0), Operands[1]);
46388 }
46389
46390 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
46391 // Match the or case only if it's 'add-like' - can be replaced by an add.
46392 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
46393 if (ISD::ADD == V.getOpcode()) {
46394 Op0 = V.getOperand(0);
46395 Op1 = V.getOperand(1);
46396 return true;
46397 }
46398 if (ISD::ZERO_EXTEND != V.getOpcode())
46399 return false;
46400 V = V.getOperand(0);
46401 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
46402 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
46403 return false;
46404 Op0 = V.getOperand(0);
46405 Op1 = V.getOperand(1);
46406 return true;
46407 };
46408
46409 SDValue Op0, Op1;
46410 if (FindAddLike(Operands[0], Op0, Op1))
46411 std::swap(Operands[0], Operands[1]);
46412 else if (!FindAddLike(Operands[1], Op0, Op1))
46413 return SDValue();
46414 Operands[2] = Op0;
46415 Operands[1] = Op1;
46416
46417 // Now we have three operands of two additions. Check that one of them is a
46418 // constant vector with ones, and the other two can be promoted from i8/i16.
46419 for (int i = 0; i < 3; ++i) {
46420 if (!IsConstVectorInRange(Operands[i], 1, 1))
46421 continue;
46422 std::swap(Operands[i], Operands[2]);
46423
46424 // Check if Operands[0] and Operands[1] are results of type promotion.
46425 for (int j = 0; j < 2; ++j)
46426 if (Operands[j].getValueType() != VT) {
46427 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
46428 Operands[j].getOperand(0).getValueType() != VT)
46429 return SDValue();
46430 Operands[j] = Operands[j].getOperand(0);
46431 }
46432
46433 // The pattern is detected, emit X86ISD::AVG instruction(s).
46434 return AVGSplitter(Operands[0], Operands[1]);
46435 }
46436
46437 return SDValue();
46438}
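
A hedged scalar model (invented name) of the averaging identity behind X86ISD::AVG: widening, adding one and shifting right by one never overflows and always fits back into the narrow type.

#include <cstdint>

// c = (a + b + 1) / 2 computed in a wider type; the result always fits back
// into 8 bits because (255 + 255 + 1) >> 1 == 255.
std::uint8_t avg_round_u8(std::uint8_t A, std::uint8_t B) {
  return static_cast<std::uint8_t>(
      (static_cast<std::uint32_t>(A) + static_cast<std::uint32_t>(B) + 1u) >> 1);
}
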
46439
46440static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
46441 TargetLowering::DAGCombinerInfo &DCI,
46442 const X86Subtarget &Subtarget) {
46443 LoadSDNode *Ld = cast<LoadSDNode>(N);
46444 EVT RegVT = Ld->getValueType(0);
46445 EVT MemVT = Ld->getMemoryVT();
46446 SDLoc dl(Ld);
46447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46448
46449 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
46450 // into two 16-byte operations. Also split non-temporal aligned loads on
46451 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
46452 ISD::LoadExtType Ext = Ld->getExtensionType();
46453 bool Fast;
46454 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
46455 Ext == ISD::NON_EXTLOAD &&
46456 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
46457 Ld->getAlignment() >= 16) ||
46458 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
46459 *Ld->getMemOperand(), &Fast) &&
46460 !Fast))) {
46461 unsigned NumElems = RegVT.getVectorNumElements();
46462 if (NumElems < 2)
46463 return SDValue();
46464
46465 unsigned HalfOffset = 16;
46466 SDValue Ptr1 = Ld->getBasePtr();
46467 SDValue Ptr2 =
46468 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
46469 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
46470 NumElems / 2);
46471 SDValue Load1 =
46472 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
46473 Ld->getOriginalAlign(),
46474 Ld->getMemOperand()->getFlags());
46475 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
46476 Ld->getPointerInfo().getWithOffset(HalfOffset),
46477 Ld->getOriginalAlign(),
46478 Ld->getMemOperand()->getFlags());
46479 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
46480 Load1.getValue(1), Load2.getValue(1));
46481
46482 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
46483 return DCI.CombineTo(N, NewVec, TF, true);
46484 }
46485
46486 // Bool vector load - attempt to cast to an integer, as we have good
46487 // (vXiY *ext(vXi1 bitcast(iX))) handling.
46488 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
46489 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
46490 unsigned NumElts = RegVT.getVectorNumElements();
46491 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
46492 if (TLI.isTypeLegal(IntVT)) {
46493 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
46494 Ld->getPointerInfo(),
46495 Ld->getOriginalAlign(),
46496 Ld->getMemOperand()->getFlags());
46497 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
46498 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
46499 }
46500 }
46501
46502 // If we also broadcast this as a subvector to a wider type, then just extract
46503 // the lowest subvector.
46504 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
46505 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
46506 SDValue Ptr = Ld->getBasePtr();
46507 SDValue Chain = Ld->getChain();
46508 for (SDNode *User : Ptr->uses()) {
46509 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
46510 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
46511 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
46512 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
46513 MemVT.getSizeInBits() &&
46514 !User->hasAnyUseOfValue(1) &&
46515 User->getValueSizeInBits(0).getFixedSize() >
46516 RegVT.getFixedSizeInBits()) {
46517 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
46518 RegVT.getSizeInBits());
46519 Extract = DAG.getBitcast(RegVT, Extract);
46520 return DCI.CombineTo(N, Extract, SDValue(User, 1));
46521 }
46522 }
46523 }
46524
46525 // Cast ptr32 and ptr64 pointers to the default address space before a load.
46526 unsigned AddrSpace = Ld->getAddressSpace();
46527 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
46528 AddrSpace == X86AS::PTR32_UPTR) {
46529 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
46530 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
46531 SDValue Cast =
46532 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
46533 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
46534 Ld->getOriginalAlign(),
46535 Ld->getMemOperand()->getFlags());
46536 }
46537 }
46538
46539 return SDValue();
46540}
46541
46542/// If V is a build vector of boolean constants and exactly one of those
46543/// constants is true, return the operand index of that true element.
46544/// Otherwise, return -1.
46545static int getOneTrueElt(SDValue V) {
46546 // This needs to be a build vector of booleans.
46547 // TODO: Checking for the i1 type matches the IR definition for the mask,
46548 // but the mask check could be loosened to i8 or other types. That might
46549 // also require checking more than 'allOnesValue'; e.g., the x86 HW
46550 // instructions only require that the MSB is set for each mask element.
46551 // The ISD::MSTORE comments/definition do not specify how the mask operand
46552 // is formatted.
46553 auto *BV = dyn_cast<BuildVectorSDNode>(V);
46554 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
46555 return -1;
46556
46557 int TrueIndex = -1;
46558 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
46559 for (unsigned i = 0; i < NumElts; ++i) {
46560 const SDValue &Op = BV->getOperand(i);
46561 if (Op.isUndef())
46562 continue;
46563 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
46564 if (!ConstNode)
46565 return -1;
46566 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
46567 // If we already found a one, this is too many.
46568 if (TrueIndex >= 0)
46569 return -1;
46570 TrueIndex = i;
46571 }
46572 }
46573 return TrueIndex;
46574}
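// Editor's note: illustrative sketch, not part of the original source. For a
// build-vector mask such as
//   V = < i1 0, i1 undef, i1 1, i1 0 >
// getOneTrueElt returns 2: undef elements are skipped and exactly one operand
// is a non-zero constant. A mask like < i1 1, i1 0, i1 1, i1 0 > returns -1
// because two elements are set.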
46575
46576/// Given a masked memory load/store operation, return true if it has one mask
46577/// bit set. If it has one mask bit set, then also return the memory address of
46578/// the scalar element to load/store, the vector index to insert/extract that
46579/// scalar element, and the alignment for the scalar memory access.
46580static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
46581 SelectionDAG &DAG, SDValue &Addr,
46582 SDValue &Index, Align &Alignment,
46583 unsigned &Offset) {
46584 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
46585 if (TrueMaskElt < 0)
46586 return false;
46587
46588 // Get the address of the one scalar element that is specified by the mask
46589 // using the appropriate offset from the base pointer.
46590 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
46591 Offset = 0;
46592 Addr = MaskedOp->getBasePtr();
46593 if (TrueMaskElt != 0) {
46594 Offset = TrueMaskElt * EltVT.getStoreSize();
46595 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
46596 SDLoc(MaskedOp));
46597 }
46598
46599 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
46600 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
46601 EltVT.getStoreSize());
46602 return true;
46603}
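// Editor's note: illustrative sketch with assumed values, not part of the
// original source. For a masked v4i32 access whose mask has its single true
// bit at element 2 and an original alignment of 16, this computes:
//   Offset    = 2 * 4 = 8
//   Addr      = BasePtr + 8
//   Index     = 2
//   Alignment = commonAlignment(16, 4) = 4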
46604
46605/// If exactly one element of the mask is set for a non-extending masked load,
46606/// it can be reduced to a scalar load and a vector insert.
46607/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46608/// mask have already been optimized in IR, so we don't bother with those here.
46609static SDValue
46610reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
46611 TargetLowering::DAGCombinerInfo &DCI,
46612 const X86Subtarget &Subtarget) {
46613  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
46614 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46615 // However, some target hooks may need to be added to know when the transform
46616 // is profitable. Endianness would also have to be considered.
46617
46618 SDValue Addr, VecIndex;
46619 Align Alignment;
46620 unsigned Offset;
46621 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
46622 return SDValue();
46623
46624 // Load the one scalar element that is specified by the mask using the
46625 // appropriate offset from the base pointer.
46626 SDLoc DL(ML);
46627 EVT VT = ML->getValueType(0);
46628 EVT EltVT = VT.getVectorElementType();
46629
46630 EVT CastVT = VT;
46631 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46632 EltVT = MVT::f64;
46633 CastVT = VT.changeVectorElementType(EltVT);
46634 }
46635
46636 SDValue Load =
46637 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
46638 ML->getPointerInfo().getWithOffset(Offset),
46639 Alignment, ML->getMemOperand()->getFlags());
46640
46641 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
46642
46643 // Insert the loaded element into the appropriate place in the vector.
46644 SDValue Insert =
46645 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
46646 Insert = DAG.getBitcast(VT, Insert);
46647 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
46648}
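// Editor's note: illustrative sketch with assumed types, not part of the
// original source. The transform above turns
//   masked.load v4i32, Ptr, Mask = <0,0,1,0>, PassThru
// into
//   Elt    = load i32, Ptr + 8
//   Result = insert_vector_elt PassThru, Elt, 2
// with the scalar load's output chain replacing the masked load's chain.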
46649
46650static SDValue
46651combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
46652 TargetLowering::DAGCombinerInfo &DCI) {
46653  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
46654 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
46655 return SDValue();
46656
46657 SDLoc DL(ML);
46658 EVT VT = ML->getValueType(0);
46659
46660 // If we are loading the first and last elements of a vector, it is safe and
46661 // always faster to load the whole vector. Replace the masked load with a
46662 // vector load and select.
46663 unsigned NumElts = VT.getVectorNumElements();
46664 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
46665 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
46666 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
46667 if (LoadFirstElt && LoadLastElt) {
46668 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
46669 ML->getMemOperand());
46670 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
46671 ML->getPassThru());
46672 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
46673 }
46674
46675 // Convert a masked load with a constant mask into a masked load and a select.
46676 // This allows the select operation to use a faster kind of select instruction
46677 // (for example, vblendvps -> vblendps).
46678
46679 // Don't try this if the pass-through operand is already undefined. That would
46680 // cause an infinite loop because that's what we're about to create.
46681 if (ML->getPassThru().isUndef())
46682 return SDValue();
46683
46684 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
46685 return SDValue();
46686
46687 // The new masked load has an undef pass-through operand. The select uses the
46688 // original pass-through operand.
46689 SDValue NewML = DAG.getMaskedLoad(
46690 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
46691 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
46692 ML->getAddressingMode(), ML->getExtensionType());
46693 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
46694 ML->getPassThru());
46695
46696 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
46697}
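// Editor's note: illustrative sketch with assumed constants, not part of the
// original source. With a constant mask covering the first and last lanes,
//   masked.load v4f32, Ptr, Mask = <1,0,0,1>, PassThru
// the code above emits
//   Full  = load v4f32, Ptr
//   Blend = select Mask, Full, PassThru
// Otherwise the masked load is kept but given an undef pass-through, and the
// original pass-through is applied by a separate select.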
46698
46699static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
46700 TargetLowering::DAGCombinerInfo &DCI,
46701 const X86Subtarget &Subtarget) {
46702 auto *Mld = cast<MaskedLoadSDNode>(N);
46703
46704 // TODO: Expanding load with constant mask may be optimized as well.
46705 if (Mld->isExpandingLoad())
46706 return SDValue();
46707
46708 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
46709 if (SDValue ScalarLoad =
46710 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
46711 return ScalarLoad;
46712
46713 // TODO: Do some AVX512 subsets benefit from this transform?
46714 if (!Subtarget.hasAVX512())
46715 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
46716 return Blend;
46717 }
46718
46719 // If the mask value has been legalized to a non-boolean vector, try to
46720 // simplify ops leading up to it. We only demand the MSB of each lane.
46721 SDValue Mask = Mld->getMask();
46722 if (Mask.getScalarValueSizeInBits() != 1) {
46723 EVT VT = Mld->getValueType(0);
46724 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46725 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46726 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46727 if (N->getOpcode() != ISD::DELETED_NODE)
46728 DCI.AddToWorklist(N);
46729 return SDValue(N, 0);
46730 }
46731 if (SDValue NewMask =
46732 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46733 return DAG.getMaskedLoad(
46734 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
46735 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
46736 Mld->getAddressingMode(), Mld->getExtensionType());
46737 }
46738
46739 return SDValue();
46740}
46741
46742/// If exactly one element of the mask is set for a non-truncating masked store,
46743/// it can be reduced to a vector extract and a scalar store.
46744/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
46745/// mask have already been optimized in IR, so we don't bother with those here.
46746static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
46747 SelectionDAG &DAG,
46748 const X86Subtarget &Subtarget) {
46749 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
46750 // However, some target hooks may need to be added to know when the transform
46751 // is profitable. Endianness would also have to be considered.
46752
46753 SDValue Addr, VecIndex;
46754 Align Alignment;
46755 unsigned Offset;
46756 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
46757 return SDValue();
46758
46759 // Extract the one scalar element that is actually being stored.
46760 SDLoc DL(MS);
46761 SDValue Value = MS->getValue();
46762 EVT VT = Value.getValueType();
46763 EVT EltVT = VT.getVectorElementType();
46764 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
46765 EltVT = MVT::f64;
46766 EVT CastVT = VT.changeVectorElementType(EltVT);
46767 Value = DAG.getBitcast(CastVT, Value);
46768 }
46769 SDValue Extract =
46770 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
46771
46772 // Store that element at the appropriate offset from the base pointer.
46773 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
46774 MS->getPointerInfo().getWithOffset(Offset),
46775 Alignment, MS->getMemOperand()->getFlags());
46776}
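// Editor's note: illustrative sketch with assumed types, not part of the
// original source. On a 32-bit target,
//   masked.store v2i64 Val, Ptr, Mask = <0,1>
// becomes
//   Cast = bitcast Val to v2f64
//   Elt  = extract_vector_elt Cast, 1
//   store f64 Elt, Ptr + 8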
46777
46778static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
46779 TargetLowering::DAGCombinerInfo &DCI,
46780 const X86Subtarget &Subtarget) {
46781 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
46782 if (Mst->isCompressingStore())
46783 return SDValue();
46784
46785 EVT VT = Mst->getValue().getValueType();
46786 SDLoc dl(Mst);
46787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46788
46789 if (Mst->isTruncatingStore())
46790 return SDValue();
46791
46792 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
46793 return ScalarStore;
46794
46795 // If the mask value has been legalized to a non-boolean vector, try to
46796 // simplify ops leading up to it. We only demand the MSB of each lane.
46797 SDValue Mask = Mst->getMask();
46798 if (Mask.getScalarValueSizeInBits() != 1) {
46799 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
46800 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
46801 if (N->getOpcode() != ISD::DELETED_NODE)
46802 DCI.AddToWorklist(N);
46803 return SDValue(N, 0);
46804 }
46805 if (SDValue NewMask =
46806 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
46807 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
46808 Mst->getBasePtr(), Mst->getOffset(), NewMask,
46809 Mst->getMemoryVT(), Mst->getMemOperand(),
46810 Mst->getAddressingMode());
46811 }
46812
46813 SDValue Value = Mst->getValue();
46814 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
46815 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
46816 Mst->getMemoryVT())) {
46817 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
46818 Mst->getBasePtr(), Mst->getOffset(), Mask,
46819 Mst->getMemoryVT(), Mst->getMemOperand(),
46820 Mst->getAddressingMode(), true);
46821 }
46822
46823 return SDValue();
46824}
46825
46826static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
46827 TargetLowering::DAGCombinerInfo &DCI,
46828 const X86Subtarget &Subtarget) {
46829 StoreSDNode *St = cast<StoreSDNode>(N);
46830 EVT StVT = St->getMemoryVT();
46831 SDLoc dl(St);
46832 SDValue StoredVal = St->getValue();
46833 EVT VT = StoredVal.getValueType();
46834 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46835
46836 // Convert a store of vXi1 into a store of iX and a bitcast.
46837 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
46838 VT.getVectorElementType() == MVT::i1) {
46839
46840 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46841 StoredVal = DAG.getBitcast(NewVT, StoredVal);
46842
46843 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46844 St->getPointerInfo(), St->getOriginalAlign(),
46845 St->getMemOperand()->getFlags());
46846 }
46847
46848 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
46849 // This will avoid a copy to k-register.
46850 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
46851 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46852 StoredVal.getOperand(0).getValueType() == MVT::i8) {
46853 SDValue Val = StoredVal.getOperand(0);
46854 // We must store zeros to the unused bits.
46855 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
46856 return DAG.getStore(St->getChain(), dl, Val,
46857 St->getBasePtr(), St->getPointerInfo(),
46858 St->getOriginalAlign(),
46859 St->getMemOperand()->getFlags());
46860 }
46861
46862 // Widen v2i1/v4i1 stores to v8i1.
46863 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
46864 Subtarget.hasAVX512()) {
46865 unsigned NumConcats = 8 / VT.getVectorNumElements();
46866 // We must store zeros to the unused bits.
46867 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
46868 Ops[0] = StoredVal;
46869 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
46870 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46871 St->getPointerInfo(), St->getOriginalAlign(),
46872 St->getMemOperand()->getFlags());
46873 }
46874
46875 // Turn vXi1 stores of constants into a scalar store.
46876 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
46877 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
46878 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
46879 // If it's a v64i1 store without 64-bit support, we need two stores.
46880 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
46881 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
46882 StoredVal->ops().slice(0, 32));
46883 Lo = combinevXi1ConstantToInteger(Lo, DAG);
46884 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
46885 StoredVal->ops().slice(32, 32));
46886 Hi = combinevXi1ConstantToInteger(Hi, DAG);
46887
46888 SDValue Ptr0 = St->getBasePtr();
46889 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
46890
46891 SDValue Ch0 =
46892 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
46893 St->getOriginalAlign(),
46894 St->getMemOperand()->getFlags());
46895 SDValue Ch1 =
46896 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
46897 St->getPointerInfo().getWithOffset(4),
46898 St->getOriginalAlign(),
46899 St->getMemOperand()->getFlags());
46900 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
46901 }
46902
46903 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
46904 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
46905 St->getPointerInfo(), St->getOriginalAlign(),
46906 St->getMemOperand()->getFlags());
46907 }
46908
46909 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
46910 // Sandy Bridge, perform two 16-byte stores.
46911 bool Fast;
46912 if (VT.is256BitVector() && StVT == VT &&
46913 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46914 *St->getMemOperand(), &Fast) &&
46915 !Fast) {
46916 unsigned NumElems = VT.getVectorNumElements();
46917 if (NumElems < 2)
46918 return SDValue();
46919
46920 return splitVectorStore(St, DAG);
46921 }
46922
46923 // Split under-aligned vector non-temporal stores.
46924 if (St->isNonTemporal() && StVT == VT &&
46925 St->getAlignment() < VT.getStoreSize()) {
46926 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
46927 // vectors or the legalizer can scalarize it to use MOVNTI.
46928 if (VT.is256BitVector() || VT.is512BitVector()) {
46929 unsigned NumElems = VT.getVectorNumElements();
46930 if (NumElems < 2)
46931 return SDValue();
46932 return splitVectorStore(St, DAG);
46933 }
46934
46935 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
46936 // to use MOVNTI.
46937 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
46938 MVT NTVT = Subtarget.hasSSE4A()
46939 ? MVT::v2f64
46940 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
46941 return scalarizeVectorStore(St, NTVT, DAG);
46942 }
46943 }
46944
46945 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
46946 // supported but AVX512F is, by extending to v16i32 and truncating.
46947 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
46948 St->getValue().getOpcode() == ISD::TRUNCATE &&
46949 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
46950 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
46951 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
46952 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
46953 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
46954 MVT::v16i8, St->getMemOperand());
46955 }
46956
46957 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
46958 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
46959 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
46960 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
46961 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
46962 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
46963 return EmitTruncSStore(IsSigned, St->getChain(),
46964 dl, StoredVal.getOperand(0), St->getBasePtr(),
46965 VT, St->getMemOperand(), DAG);
46966 }
46967
46968 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
46969 if (!St->isTruncatingStore() && StoredVal.hasOneUse()) {
46970 auto IsExtractedElement = [](SDValue V) {
46971 if (V.getOpcode() == ISD::TRUNCATE && V.getOperand(0).hasOneUse())
46972 V = V.getOperand(0);
46973 unsigned Opc = V.getOpcode();
46974 if (Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) {
46975 if (V.getOperand(0).hasOneUse() && isNullConstant(V.getOperand(1)))
46976 return V.getOperand(0);
46977 }
46978 return SDValue();
46979 };
46980 if (SDValue Extract = IsExtractedElement(StoredVal)) {
46981 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
46982 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
46983 SDValue Src = Trunc.getOperand(0);
46984 MVT DstVT = Trunc.getSimpleValueType();
46985 MVT SrcVT = Src.getSimpleValueType();
46986 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46987 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
46988 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
46989 if (NumTruncBits == VT.getSizeInBits() &&
46990 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
46991 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
46992 TruncVT, St->getMemOperand());
46993 }
46994 }
46995 }
46996 }
46997
46998 // Optimize trunc store (of multiple scalars) to shuffle and store.
46999 // First, pack all of the elements in one place. Next, store to memory
47000 // in fewer chunks.
47001 if (St->isTruncatingStore() && VT.isVector()) {
47002 // Check if we can detect an AVG pattern from the truncation. If yes,
47003 // replace the trunc store by a normal store with the result of X86ISD::AVG
47004 // instruction.
47005 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
47006 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
47007 Subtarget, dl))
47008 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
47009 St->getPointerInfo(), St->getOriginalAlign(),
47010 St->getMemOperand()->getFlags());
47011
47012 if (TLI.isTruncStoreLegal(VT, StVT)) {
47013 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
47014 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
47015 dl, Val, St->getBasePtr(),
47016 St->getMemoryVT(), St->getMemOperand(), DAG);
47017 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
47018 DAG, dl))
47019 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
47020 dl, Val, St->getBasePtr(),
47021 St->getMemoryVT(), St->getMemOperand(), DAG);
47022 }
47023
47024 return SDValue();
47025 }
47026
47027 // Cast ptr32 and ptr64 pointers to the default address space before a store.
47028 unsigned AddrSpace = St->getAddressSpace();
47029 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
47030 AddrSpace == X86AS::PTR32_UPTR) {
47031 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
47032 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
47033 SDValue Cast =
47034 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
47035 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
47036 St->getPointerInfo(), St->getOriginalAlign(),
47037 St->getMemOperand()->getFlags(), St->getAAInfo());
47038 }
47039 }
47040
47041 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
47042 // the FP state in cases where an emms may be missing.
47043 // A preferable solution to the general problem is to figure out the right
47044 // places to insert EMMS. This qualifies as a quick hack.
47045
47046 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
47047 if (VT.getSizeInBits() != 64)
47048 return SDValue();
47049
47050 const Function &F = DAG.getMachineFunction().getFunction();
47051 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
47052 bool F64IsLegal =
47053 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
47054 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
47055 isa<LoadSDNode>(St->getValue()) &&
47056 cast<LoadSDNode>(St->getValue())->isSimple() &&
47057 St->getChain().hasOneUse() && St->isSimple()) {
47058 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
47059
47060 if (!ISD::isNormalLoad(Ld))
47061 return SDValue();
47062
47063 // Avoid the transformation if there are multiple uses of the loaded value.
47064 if (!Ld->hasNUsesOfValue(1, 0))
47065 return SDValue();
47066
47067 SDLoc LdDL(Ld);
47068 SDLoc StDL(N);
47069 // Lower to a single movq load/store pair.
47070 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
47071 Ld->getBasePtr(), Ld->getMemOperand());
47072
47073 // Make sure new load is placed in same chain order.
47074 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
47075 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
47076 St->getMemOperand());
47077 }
47078
47079 // This is similar to the above case, but here we handle a scalar 64-bit
47080 // integer store that is extracted from a vector on a 32-bit target.
47081 // If we have SSE2, then we can treat it like a floating-point double
47082 // to get past legalization. The execution dependencies fixup pass will
47083 // choose the optimal machine instruction for the store if this really is
47084 // an integer or v2f32 rather than an f64.
47085 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
47086 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
47087 SDValue OldExtract = St->getOperand(1);
47088 SDValue ExtOp0 = OldExtract.getOperand(0);
47089 unsigned VecSize = ExtOp0.getValueSizeInBits();
47090 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
47091 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
47092 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
47093 BitCast, OldExtract.getOperand(1));
47094 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
47095 St->getPointerInfo(), St->getOriginalAlign(),
47096 St->getMemOperand()->getFlags());
47097 }
47098
47099 return SDValue();
47100}
47101
47102static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
47103 TargetLowering::DAGCombinerInfo &DCI,
47104 const X86Subtarget &Subtarget) {
47105 auto *St = cast<MemIntrinsicSDNode>(N);
47106
47107 SDValue StoredVal = N->getOperand(1);
47108 MVT VT = StoredVal.getSimpleValueType();
47109 EVT MemVT = St->getMemoryVT();
47110
47111 // Figure out which elements we demand.
47112 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
47113 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
47114
47115 APInt KnownUndef, KnownZero;
47116 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47117 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
47118 KnownZero, DCI)) {
47119 if (N->getOpcode() != ISD::DELETED_NODE)
47120 DCI.AddToWorklist(N);
47121 return SDValue(N, 0);
47122 }
47123
47124 return SDValue();
47125}
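// Editor's note: illustrative sketch with assumed types, not part of the
// original source. For a VEXTRACT_STORE whose stored value is v4i32 and whose
// memory VT is i64, StElts = 64 / 32 = 2, so only the low two vector elements
// are demanded and the upper elements may be simplified away.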
47126
47127/// Return 'true' if this vector operation is "horizontal"
47128/// and return the operands for the horizontal operation in LHS and RHS. A
47129/// horizontal operation performs the binary operation on successive elements
47130/// of its first operand, then on successive elements of its second operand,
47131/// returning the resulting values in a vector. For example, if
47132/// A = < float a0, float a1, float a2, float a3 >
47133/// and
47134/// B = < float b0, float b1, float b2, float b3 >
47135/// then the result of doing a horizontal operation on A and B is
47136/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
47137/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
47138/// A horizontal-op B, for some already available A and B, and if so then LHS is
47139/// set to A, RHS to B, and the routine returns 'true'.
47140static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
47141 SelectionDAG &DAG, const X86Subtarget &Subtarget,
47142 bool IsCommutative,
47143 SmallVectorImpl<int> &PostShuffleMask) {
47144 // If either operand is undef, bail out. The binop should be simplified.
47145 if (LHS.isUndef() || RHS.isUndef())
47146 return false;
47147
47148 // Look for the following pattern:
47149 // A = < float a0, float a1, float a2, float a3 >
47150 // B = < float b0, float b1, float b2, float b3 >
47151 // and
47152 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
47153 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
47154 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
47155 // which is A horizontal-op B.
47156
47157 MVT VT = LHS.getSimpleValueType();
47158  assert((VT.is128BitVector() || VT.is256BitVector()) &&
47159         "Unsupported vector type for horizontal add/sub");
47160 unsigned NumElts = VT.getVectorNumElements();
47161
47162 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
47163 SmallVectorImpl<int> &ShuffleMask) {
47164 bool UseSubVector = false;
47165 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
47166 Op.getOperand(0).getValueType().is256BitVector() &&
47167 llvm::isNullConstant(Op.getOperand(1))) {
47168 Op = Op.getOperand(0);
47169 UseSubVector = true;
47170 }
47171 SmallVector<SDValue, 2> SrcOps;
47172 SmallVector<int, 16> SrcMask, ScaledMask;
47173 SDValue BC = peekThroughBitcasts(Op);
47174 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
47175 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
47176 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
47177 })) {
47178 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
47179 if (!UseSubVector && SrcOps.size() <= 2 &&
47180 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
47181 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
47182 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
47183 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
47184 }
47185 if (UseSubVector && SrcOps.size() == 1 &&
47186 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
47187 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
47188 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
47189 ShuffleMask.assign(Mask.begin(), Mask.end());
47190 }
47191 }
47192 };
47193
47194 // View LHS in the form
47195 // LHS = VECTOR_SHUFFLE A, B, LMask
47196 // If LHS is not a shuffle, then pretend it is the identity shuffle:
47197 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
47198 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
47199 SDValue A, B;
47200 SmallVector<int, 16> LMask;
47201 GetShuffle(LHS, A, B, LMask);
47202
47203 // Likewise, view RHS in the form
47204 // RHS = VECTOR_SHUFFLE C, D, RMask
47205 SDValue C, D;
47206 SmallVector<int, 16> RMask;
47207 GetShuffle(RHS, C, D, RMask);
47208
47209 // At least one of the operands should be a vector shuffle.
47210 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
47211 if (NumShuffles == 0)
47212 return false;
47213
47214 if (LMask.empty()) {
47215 A = LHS;
47216 for (unsigned i = 0; i != NumElts; ++i)
47217 LMask.push_back(i);
47218 }
47219
47220 if (RMask.empty()) {
47221 C = RHS;
47222 for (unsigned i = 0; i != NumElts; ++i)
47223 RMask.push_back(i);
47224 }
47225
47226 // If we have a unary mask, ensure the other op is set to null.
47227 if (isUndefOrInRange(LMask, 0, NumElts))
47228 B = SDValue();
47229 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
47230 A = SDValue();
47231
47232 if (isUndefOrInRange(RMask, 0, NumElts))
47233 D = SDValue();
47234 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
47235 C = SDValue();
47236
47237 // If A and B occur in reverse order in RHS, then canonicalize by commuting
47238 // RHS operands and shuffle mask.
47239 if (A != C) {
47240 std::swap(C, D);
47241 ShuffleVectorSDNode::commuteMask(RMask);
47242 }
47243 // Check that the shuffles are both shuffling the same vectors.
47244 if (!(A == C && B == D))
47245 return false;
47246
47247 PostShuffleMask.clear();
47248 PostShuffleMask.append(NumElts, SM_SentinelUndef);
47249
47250 // LHS and RHS are now:
47251 // LHS = shuffle A, B, LMask
47252 // RHS = shuffle A, B, RMask
47253 // Check that the masks correspond to performing a horizontal operation.
47254 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
47255 // so we just repeat the inner loop if this is a 256-bit op.
47256 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
47257 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
47258 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
47259  assert((NumEltsPer128BitChunk % 2 == 0) &&
47260         "Vector type should have an even number of elements in each lane");
47261 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
47262 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
47263 // Ignore undefined components.
47264 int LIdx = LMask[i + j], RIdx = RMask[i + j];
47265 if (LIdx < 0 || RIdx < 0 ||
47266 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
47267 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
47268 continue;
47269
47270 // Check that successive odd/even elements are being operated on. If not,
47271 // this is not a horizontal operation.
47272 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
47273 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
47274 return false;
47275
47276 // Compute the post-shuffle mask index based on where the element
47277 // is stored in the HOP result, and where it needs to be moved to.
47278 int Base = LIdx & ~1u;
47279 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
47280 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
47281
47282 // The low half of the 128-bit result must choose from A.
47283 // The high half of the 128-bit result must choose from B,
47284 // unless B is undef. In that case, we are always choosing from A.
47285 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
47286 Index += NumEltsPer64BitChunk;
47287 PostShuffleMask[i + j] = Index;
47288 }
47289 }
47290
47291 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
47292 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
47293
47294 bool IsIdentityPostShuffle =
47295 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
47296 if (IsIdentityPostShuffle)
47297 PostShuffleMask.clear();
47298
47299 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
47300 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
47301 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
47302 return false;
47303
47304 // If the source nodes are already used in HorizOps then always accept this.
47305 // Shuffle folding should merge these back together.
47306 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
47307 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
47308 });
47309 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
47310 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
47311 });
47312 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
47313
47314 // Assume a SingleSource HOP if we only shuffle one input and don't need to
47315 // shuffle the result.
47316 if (!ForceHorizOp &&
47317 !shouldUseHorizontalOp(NewLHS == NewRHS &&
47318 (NumShuffles < 2 || !IsIdentityPostShuffle),
47319 DAG, Subtarget))
47320 return false;
47321
47322 LHS = DAG.getBitcast(VT, NewLHS);
47323 RHS = DAG.getBitcast(VT, NewRHS);
47324 return true;
47325}
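// Editor's note: illustrative sketch, not part of the original source. A match
// made by isHorizontalBinOp for a v4f32 FADD:
//   LHS = vector_shuffle A, B, <0, 2, 4, 6>
//   RHS = vector_shuffle A, B, <1, 3, 5, 7>
// Every (LIdx, RIdx) pair is (even, even+1), so LHS + RHS is recognized as
// FHADD(A, B) and PostShuffleMask works out to the identity <0, 1, 2, 3>,
// which is then cleared.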
47326
47327// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
47328static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
47329 const X86Subtarget &Subtarget) {
47330 EVT VT = N->getValueType(0);
47331 unsigned Opcode = N->getOpcode();
47332 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
47333 SmallVector<int, 8> PostShuffleMask;
47334
47335 switch (Opcode) {
47336 case ISD::FADD:
47337 case ISD::FSUB:
47338 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
47339 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
47340 SDValue LHS = N->getOperand(0);
47341 SDValue RHS = N->getOperand(1);
47342 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
47343 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
47344 PostShuffleMask)) {
47345 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
47346 if (!PostShuffleMask.empty())
47347 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47348 DAG.getUNDEF(VT), PostShuffleMask);
47349 return HorizBinOp;
47350 }
47351 }
47352 break;
47353 case ISD::ADD:
47354 case ISD::SUB:
47355 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
47356 VT == MVT::v16i16 || VT == MVT::v8i32)) {
47357 SDValue LHS = N->getOperand(0);
47358 SDValue RHS = N->getOperand(1);
47359 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
47360 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
47361 PostShuffleMask)) {
47362 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
47363 ArrayRef<SDValue> Ops) {
47364 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
47365 };
47366 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
47367 {LHS, RHS}, HOpBuilder);
47368 if (!PostShuffleMask.empty())
47369 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
47370 DAG.getUNDEF(VT), PostShuffleMask);
47371 return HorizBinOp;
47372 }
47373 }
47374 break;
47375 }
47376
47377 return SDValue();
47378}
47379
47380/// Do target-specific dag combines on floating-point adds/subs.
47381static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
47382 const X86Subtarget &Subtarget) {
47383 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
47384 return HOp;
47385 return SDValue();
47386}
47387
47388/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
47389/// the codegen.
47390/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
47391/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
47392/// anything that is guaranteed to be transformed by DAGCombiner.
47393static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
47394 const X86Subtarget &Subtarget,
47395 const SDLoc &DL) {
47396  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
47397 SDValue Src = N->getOperand(0);
47398 unsigned SrcOpcode = Src.getOpcode();
47399 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47400
47401 EVT VT = N->getValueType(0);
47402 EVT SrcVT = Src.getValueType();
47403
47404 auto IsFreeTruncation = [VT](SDValue Op) {
47405 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
47406
47407 // See if this has been extended from a smaller/equal size to
47408 // the truncation size, allowing a truncation to combine with the extend.
47409 unsigned Opcode = Op.getOpcode();
47410 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
47411 Opcode == ISD::ZERO_EXTEND) &&
47412 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
47413 return true;
47414
47415 // See if this is a single use constant which can be constant folded.
47416 // NOTE: We don't peek through bitcasts here because there is currently
47417 // no support for constant folding truncate+bitcast+vector_of_constants. So
47418 // we'll just end up with a truncate on both operands, which will
47419 // get turned back into (truncate (binop)), causing an infinite loop.
47420 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
47421 };
47422
47423 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
47424 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
47425 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
47426 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
47427 };
47428
47429 // Don't combine if the operation has other uses.
47430 if (!Src.hasOneUse())
47431 return SDValue();
47432
47433 // Only support vector truncation for now.
47434 // TODO: i64 scalar math would benefit as well.
47435 if (!VT.isVector())
47436 return SDValue();
47437
47438 // In most cases it's only worth pre-truncating if we're only facing the cost
47439 // of one truncation.
47440 // i.e. if one of the inputs will constant fold or the input is repeated.
47441 switch (SrcOpcode) {
47442 case ISD::MUL:
47443 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
47444 // better to truncate if we have the chance.
47445 if (SrcVT.getScalarType() == MVT::i64 &&
47446 TLI.isOperationLegal(SrcOpcode, VT) &&
47447 !TLI.isOperationLegal(SrcOpcode, SrcVT))
47448 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
47449    LLVM_FALLTHROUGH;
47450 case ISD::AND:
47451 case ISD::XOR:
47452 case ISD::OR:
47453 case ISD::ADD:
47454 case ISD::SUB: {
47455 SDValue Op0 = Src.getOperand(0);
47456 SDValue Op1 = Src.getOperand(1);
47457 if (TLI.isOperationLegal(SrcOpcode, VT) &&
47458 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
47459 return TruncateArithmetic(Op0, Op1);
47460 break;
47461 }
47462 }
47463
47464 return SDValue();
47465}
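// Editor's note: illustrative sketch with assumed types, not part of the
// original source. When i32 vector adds are legal,
//   trunc v4i32 (add v4i64 X, (build_vector 1, 2, 3, 4))
// becomes
//   add v4i32 (trunc X), (build_vector 1, 2, 3, 4)
// because the truncate of the constant operand folds away, leaving only a
// single real truncation.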
47466
47467/// Truncate using ISD::AND mask and X86ISD::PACKUS.
47468/// e.g. trunc <8 x i32> X to <8 x i16> -->
47469/// MaskX = X & 0xffff (clear high bits to prevent saturation)
47470/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
47471static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
47472 const X86Subtarget &Subtarget,
47473 SelectionDAG &DAG) {
47474 SDValue In = N->getOperand(0);
47475 EVT InVT = In.getValueType();
47476 EVT OutVT = N->getValueType(0);
47477
47478 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
47479 OutVT.getScalarSizeInBits());
47480 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
47481 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
47482}
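// Editor's note: illustrative sketch with assumed types, not part of the
// original source. For trunc v8i32 -> v8i16 this roughly emits:
//   MaskX = and v8i32 X, splat(0xFFFF)   ; clear high bits to avoid saturation
//   Lo    = extract_subvector MaskX, 0
//   Hi    = extract_subvector MaskX, 4
//   Res   = X86ISD::PACKUS Lo, Hi        ; v8i16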
47483
47484/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
47485static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
47486 const X86Subtarget &Subtarget,
47487 SelectionDAG &DAG) {
47488 SDValue In = N->getOperand(0);
47489 EVT InVT = In.getValueType();
47490 EVT OutVT = N->getValueType(0);
47491 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
47492 DAG.getValueType(OutVT));
47493 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
47494}
47495
47496/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
47497/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
47498/// legalization the truncation will be translated into a BUILD_VECTOR with each
47499/// element that is extracted from a vector and then truncated, and it is
47500/// difficult to do this optimization based on them.
47501static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
47502 const X86Subtarget &Subtarget) {
47503 EVT OutVT = N->getValueType(0);
47504 if (!OutVT.isVector())
47505 return SDValue();
47506
47507 SDValue In = N->getOperand(0);
47508 if (!In.getValueType().isSimple())
47509 return SDValue();
47510
47511 EVT InVT = In.getValueType();
47512 unsigned NumElems = OutVT.getVectorNumElements();
47513
47514 // AVX512 provides fast truncate ops.
47515 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47516 return SDValue();
47517
47518 EVT OutSVT = OutVT.getVectorElementType();
47519 EVT InSVT = InVT.getVectorElementType();
47520 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
47521 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
47522 NumElems >= 8))
47523 return SDValue();
47524
47525 // SSSE3's pshufb results in fewer instructions in the cases below.
47526 if (Subtarget.hasSSSE3() && NumElems == 8 && InSVT != MVT::i64)
47527 return SDValue();
47528
47529 SDLoc DL(N);
47530 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
47531 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
47532 // truncate 2 x v4i32 to v8i16.
47533 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
47534 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
47535 if (InSVT == MVT::i32)
47536 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
47537
47538 return SDValue();
47539}
47540
47541/// This function transforms vector truncation of 'extended sign-bits' or
47542/// 'extended zero-bits' values.
47543/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
47544static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
47545 SelectionDAG &DAG,
47546 const X86Subtarget &Subtarget) {
47547 // Requires SSE2.
47548 if (!Subtarget.hasSSE2())
47549 return SDValue();
47550
47551 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
47552 return SDValue();
47553
47554 SDValue In = N->getOperand(0);
47555 if (!In.getValueType().isSimple())
47556 return SDValue();
47557
47558 MVT VT = N->getValueType(0).getSimpleVT();
47559 MVT SVT = VT.getScalarType();
47560
47561 MVT InVT = In.getValueType().getSimpleVT();
47562 MVT InSVT = InVT.getScalarType();
47563
47564 // Check we have a truncation suited for PACKSS/PACKUS.
47565 if (!isPowerOf2_32(VT.getVectorNumElements()))
47566 return SDValue();
47567 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
47568 return SDValue();
47569 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
47570 return SDValue();
47571
47572 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
47573 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
47574 return SDValue();
47575
47576 // AVX512 has fast truncate, but if the input is already going to be split,
47577 // there's no harm in trying pack.
47578 if (Subtarget.hasAVX512() &&
47579 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
47580 InVT.is512BitVector())) {
47581 // PACK should still be worth it for 128-bit vectors if the sources were
47582 // originally concatenated from subvectors.
47583 SmallVector<SDValue> ConcatOps;
47584 if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
47585 return SDValue();
47586 }
47587
47588 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
47589 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
47590
47591 // Use PACKUS if the input has zero-bits that extend all the way to the
47592 // packed/truncated value. e.g. masks, zext_in_reg, etc.
47593 KnownBits Known = DAG.computeKnownBits(In);
47594 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
47595 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
47596 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
47597
47598 // Use PACKSS if the input has sign-bits that extend all the way to the
47599 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
47600 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
47601
47602 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
47603 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
47604 // on and combines/simplifications can't then use it.
47605 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
47606 return SDValue();
47607
47608 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
47609 if (NumSignBits > MinSignBits)
47610 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
47611
47612 // If we have an srl that only generates sign bits that we will discard in
47613 // the truncation, then we can use PACKSS by converting the srl to an sra.
47614 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
47615 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
47616 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
47617 In, APInt::getAllOnesValue(VT.getVectorNumElements()))) {
47618 if (*ShAmt == MinSignBits) {
47619 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
47620 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
47621 Subtarget);
47622 }
47623 }
47624
47625 return SDValue();
47626}
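// Editor's note: illustrative sketch with assumed values, not part of the
// original source. For a v8i32 -> v8i16 truncation, NumPackedSignBits = 16 and
// MinSignBits = 32 - 16 = 16. A comparison result (32 sign bits) therefore
// takes the PACKSS path, while an input with at least 16 known leading zero
// bits (with SSE4.1) takes the PACKUS path.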
47627
47628// Try to form a MULHU or MULHS node by looking for
47629// (trunc (srl (mul ext, ext), 16))
47630// TODO: This is X86 specific because we want to be able to handle wide types
47631// before type legalization. But we can only do it if the vector will be
47632// legalized via widening/splitting. Type legalization can't handle promotion
47633// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47634// combiner.
47635static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
47636 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47637 // First instruction should be a right shift of a multiply.
47638 if (Src.getOpcode() != ISD::SRL ||
47639 Src.getOperand(0).getOpcode() != ISD::MUL)
47640 return SDValue();
47641
47642 if (!Subtarget.hasSSE2())
47643 return SDValue();
47644
47645 // Only handle vXi16 types that are at least 128-bits unless they will be
47646 // widened.
47647 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
47648 return SDValue();
47649
47650 // Input type should be at least vXi32.
47651 EVT InVT = Src.getValueType();
47652 if (InVT.getVectorElementType().getSizeInBits() < 32)
47653 return SDValue();
47654
47655 // Need a shift by 16.
47656 APInt ShiftAmt;
47657 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
47658 ShiftAmt != 16)
47659 return SDValue();
47660
47661 SDValue LHS = Src.getOperand(0).getOperand(0);
47662 SDValue RHS = Src.getOperand(0).getOperand(1);
47663
47664 unsigned ExtOpc = LHS.getOpcode();
47665 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47666 RHS.getOpcode() != ExtOpc)
47667 return SDValue();
47668
47669 // Peek through the extends.
47670 LHS = LHS.getOperand(0);
47671 RHS = RHS.getOperand(0);
47672
47673 // Ensure the input types match.
47674 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
47675 return SDValue();
47676
47677 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47678 return DAG.getNode(Opc, DL, VT, LHS, RHS);
47679}
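// Editor's note: illustrative sketch with assumed types, not part of the
// original source. The pattern matched above is
//   trunc v8i16 (srl (mul (zext v8i16 X to v8i32),
//                         (zext v8i16 Y to v8i32)), 16)
// which is rewritten as
//   mulhu v8i16 X, Y
// (or mulhs when both inputs were sign-extended).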
47680
47681// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
47682// from one vector with signed bytes from another vector, adds together
47683// adjacent pairs of 16-bit products, and saturates the result before
47684// truncating to 16-bits.
47685//
47686// Which looks something like this:
47687// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
47688// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
47689static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
47690 const X86Subtarget &Subtarget,
47691 const SDLoc &DL) {
47692 if (!VT.isVector() || !Subtarget.hasSSSE3())
47693 return SDValue();
47694
47695 unsigned NumElems = VT.getVectorNumElements();
47696 EVT ScalarVT = VT.getVectorElementType();
47697 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
47698 return SDValue();
47699
47700 SDValue SSatVal = detectSSatPattern(In, VT);
47701 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
47702 return SDValue();
47703
47704 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
47705 // of multiplies from even/odd elements.
47706 SDValue N0 = SSatVal.getOperand(0);
47707 SDValue N1 = SSatVal.getOperand(1);
47708
47709 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
47710 return SDValue();
47711
47712 SDValue N00 = N0.getOperand(0);
47713 SDValue N01 = N0.getOperand(1);
47714 SDValue N10 = N1.getOperand(0);
47715 SDValue N11 = N1.getOperand(1);
47716
47717 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
47718 // Canonicalize zero_extend to LHS.
47719 if (N01.getOpcode() == ISD::ZERO_EXTEND)
47720 std::swap(N00, N01);
47721 if (N11.getOpcode() == ISD::ZERO_EXTEND)
47722 std::swap(N10, N11);
47723
47724 // Ensure we have a zero_extend and a sign_extend.
47725 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
47726 N01.getOpcode() != ISD::SIGN_EXTEND ||
47727 N10.getOpcode() != ISD::ZERO_EXTEND ||
47728 N11.getOpcode() != ISD::SIGN_EXTEND)
47729 return SDValue();
47730
47731 // Peek through the extends.
47732 N00 = N00.getOperand(0);
47733 N01 = N01.getOperand(0);
47734 N10 = N10.getOperand(0);
47735 N11 = N11.getOperand(0);
47736
47737 // Ensure the extend is from vXi8.
47738 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
47739 N01.getValueType().getVectorElementType() != MVT::i8 ||
47740 N10.getValueType().getVectorElementType() != MVT::i8 ||
47741 N11.getValueType().getVectorElementType() != MVT::i8)
47742 return SDValue();
47743
47744 // All inputs should be build_vectors.
47745 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
47746 N01.getOpcode() != ISD::BUILD_VECTOR ||
47747 N10.getOpcode() != ISD::BUILD_VECTOR ||
47748 N11.getOpcode() != ISD::BUILD_VECTOR)
47749 return SDValue();
47750
47751 // N00/N10 are zero extended. N01/N11 are sign extended.
47752
47753 // For each element, we need to ensure we have an odd element from one vector
47754 // multiplied by the odd element of another vector and the even element from
47755 // one of the same vectors being multiplied by the even element from the
47756 // other vector. So we need to make sure for each element i, this operator
47757 // is being performed:
47758 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
47759 SDValue ZExtIn, SExtIn;
47760 for (unsigned i = 0; i != NumElems; ++i) {
47761 SDValue N00Elt = N00.getOperand(i);
47762 SDValue N01Elt = N01.getOperand(i);
47763 SDValue N10Elt = N10.getOperand(i);
47764 SDValue N11Elt = N11.getOperand(i);
47765 // TODO: Be more tolerant to undefs.
47766 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47767 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47768 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
47769 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
47770 return SDValue();
47771 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
47772 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
47773 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
47774 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
47775 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
47776 return SDValue();
47777 unsigned IdxN00 = ConstN00Elt->getZExtValue();
47778 unsigned IdxN01 = ConstN01Elt->getZExtValue();
47779 unsigned IdxN10 = ConstN10Elt->getZExtValue();
47780 unsigned IdxN11 = ConstN11Elt->getZExtValue();
47781 // Add is commutative so indices can be reordered.
47782 if (IdxN00 > IdxN10) {
47783 std::swap(IdxN00, IdxN10);
47784 std::swap(IdxN01, IdxN11);
47785 }
47786 // N0 indices must be the even element. N1 indices must be the next odd element.
47787 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
47788 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
47789 return SDValue();
47790 SDValue N00In = N00Elt.getOperand(0);
47791 SDValue N01In = N01Elt.getOperand(0);
47792 SDValue N10In = N10Elt.getOperand(0);
47793 SDValue N11In = N11Elt.getOperand(0);
47794 // First time we find an input capture it.
47795 if (!ZExtIn) {
47796 ZExtIn = N00In;
47797 SExtIn = N01In;
47798 }
47799 if (ZExtIn != N00In || SExtIn != N01In ||
47800 ZExtIn != N10In || SExtIn != N11In)
47801 return SDValue();
47802 }
47803
47804 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47805 ArrayRef<SDValue> Ops) {
47806 // Shrink by adding truncate nodes and let DAGCombine fold with the
47807 // sources.
47808 EVT InVT = Ops[0].getValueType();
47809 assert(InVT.getScalarType() == MVT::i8 &&
47810 "Unexpected scalar element type");
47811 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
47812 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47813 InVT.getVectorNumElements() / 2);
47814 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
47815 };
47816 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
47817 PMADDBuilder);
47818}
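For reference, a minimal scalar sketch (not part of the original file) of the operation the combine above forms: VPMADDUBSW multiplies unsigned bytes from one input by signed bytes from the other, adds adjacent products, and saturates the result to signed 16 bits.

// Illustrative only; assumes <cstdint> and <algorithm> are available.
static int16_t pmaddubswScalar(uint8_t A0, uint8_t A1, int8_t B0, int8_t B1) {
  int32_t Sum = int32_t(A0) * B0 + int32_t(A1) * B1; // u8 * s8 adjacent pairs
  Sum = std::min(32767, std::max(-32768, Sum));      // saturate to i16
  return static_cast<int16_t>(Sum);
}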
47819
47820static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
47821 const X86Subtarget &Subtarget) {
47822 EVT VT = N->getValueType(0);
47823 SDValue Src = N->getOperand(0);
47824 SDLoc DL(N);
47825
47826 // Attempt to pre-truncate inputs to arithmetic ops instead.
47827 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
47828 return V;
47829
47830 // Try to detect AVG pattern first.
47831 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
47832 return Avg;
47833
47834 // Try to detect PMADD
47835 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
47836 return PMAdd;
47837
47838 // Try to combine truncation with signed/unsigned saturation.
47839 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
47840 return Val;
47841
47842 // Try to combine PMULHUW/PMULHW for vXi16.
47843 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
47844 return V;
47845
47846 // The bitcast source is a direct mmx result.
47847 // Detect bitcasts between i32 and x86mmx.
47848 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
47849 SDValue BCSrc = Src.getOperand(0);
47850 if (BCSrc.getValueType() == MVT::x86mmx)
47851 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
47852 }
47853
47854 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
47855 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
47856 return V;
47857
47858 return combineVectorTruncation(N, DAG, Subtarget);
47859}
47860
47861static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
47862 TargetLowering::DAGCombinerInfo &DCI) {
47863 EVT VT = N->getValueType(0);
47864 SDValue In = N->getOperand(0);
47865 SDLoc DL(N);
47866
47867 if (auto SSatVal = detectSSatPattern(In, VT))
47868 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
47869 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
47870 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
47871
47872 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47873 APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
47874 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
47875 return SDValue(N, 0);
47876
47877 return SDValue();
47878}
47879
47880/// Returns the negated value if the node \p N flips the sign of an FP value.
47881///
47882/// An FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
47883/// or FSUB(0, x).
47884/// AVX512F does not have FXOR, so FNEG is lowered as
47885/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
47886/// In this case we go through all bitcasts.
47887/// This also recognizes a splat of a negated value and returns the splat of that
47888/// value.
47889static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
47890 if (N->getOpcode() == ISD::FNEG)
47891 return N->getOperand(0);
47892
47893 // Don't recurse exponentially.
47894 if (Depth > SelectionDAG::MaxRecursionDepth)
47895 return SDValue();
47896
47897 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
47898
47899 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
47900 EVT VT = Op->getValueType(0);
47901
47902 // Make sure the element size doesn't change.
47903 if (VT.getScalarSizeInBits() != ScalarSize)
47904 return SDValue();
47905
47906 unsigned Opc = Op.getOpcode();
47907 switch (Opc) {
47908 case ISD::VECTOR_SHUFFLE: {
47909 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
47910 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
47911 if (!Op.getOperand(1).isUndef())
47912 return SDValue();
47913 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
47914 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
47915 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
47916 cast<ShuffleVectorSDNode>(Op)->getMask());
47917 break;
47918 }
47919 case ISD::INSERT_VECTOR_ELT: {
47920 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
47921 // -V, INDEX).
47922 SDValue InsVector = Op.getOperand(0);
47923 SDValue InsVal = Op.getOperand(1);
47924 if (!InsVector.isUndef())
47925 return SDValue();
47926 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
47927 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
47928 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
47929 NegInsVal, Op.getOperand(2));
47930 break;
47931 }
47932 case ISD::FSUB:
47933 case ISD::XOR:
47934 case X86ISD::FXOR: {
47935 SDValue Op1 = Op.getOperand(1);
47936 SDValue Op0 = Op.getOperand(0);
47937
47938 // For XOR and FXOR, we want to check if constant
47939 // bits of Op1 are sign bit masks. For FSUB, we
47940 // have to check if constant bits of Op0 are sign
47941 // bit masks and hence we swap the operands.
47942 if (Opc == ISD::FSUB)
47943 std::swap(Op0, Op1);
47944
47945 APInt UndefElts;
47946 SmallVector<APInt, 16> EltBits;
47947 // Extract constant bits and see if they are all
47948 // sign bit masks. Ignore the undef elements.
47949 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
47950 /* AllowWholeUndefs */ true,
47951 /* AllowPartialUndefs */ false)) {
47952 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
47953 if (!UndefElts[I] && !EltBits[I].isSignMask())
47954 return SDValue();
47955
47956 return peekThroughBitcasts(Op0);
47957 }
47958 }
47959 }
47960
47961 return SDValue();
47962}
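As a rough illustration of the sign-bit-mask case handled above (not from the original file): XORing an IEEE-754 value with its sign mask is exactly an FP negation, which is why FXOR/XOR against sign-bit-mask constants is treated as FNEG.

// Illustrative only; assumes <cstdint> and <cstring> are available.
static float fnegViaXor(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u;                  // flip only the sign bit
  std::memcpy(&X, &Bits, sizeof(Bits));
  return X;                             // bitwise identical to -X
}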
47963
47964static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
47965 bool NegRes) {
47966 if (NegMul) {
47967 switch (Opcode) {
47968 default: llvm_unreachable("Unexpected opcode");
47969 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
47970 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
47971 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
47972 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
47973 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
47974 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
47975 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
47976 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
47977 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
47978 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
47979 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
47980 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
47981 }
47982 }
47983
47984 if (NegAcc) {
47985 switch (Opcode) {
47986 default: llvm_unreachable("Unexpected opcode");
47987 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
47988 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
47989 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
47990 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
47991 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
47992 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
47993 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
47994 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
47995 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
47996 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
47997 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
47998 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
47999 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
48000 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
48001 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
48002 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
48003 }
48004 }
48005
48006 if (NegRes) {
48007 switch (Opcode) {
48008 // For accuracy reasons, we never combine fneg and fma under strict FP.
48009 default: llvm_unreachable("Unexpected opcode");
48010 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
48011 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
48012 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
48013 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
48014 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
48015 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
48016 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
48017 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
48018 }
48019 }
48020
48021 return Opcode;
48022}
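The opcode table above is easier to follow against the scalar identities it encodes; a hedged sketch (function names are illustrative, and the *_RND and STRICT_* variants follow the same algebra):

static double fmaddRef (double A, double B, double C) { return  A * B + C; }
static double fmsubRef (double A, double B, double C) { return  A * B - C; }
static double fnmaddRef(double A, double B, double C) { return -A * B + C; }
static double fnmsubRef(double A, double B, double C) { return -A * B - C; }
// e.g. negating the result of FMADD gives FNMSUB: -(A*B + C) == -A*B - C,
// matching the NegRes mapping of ISD::FMA to X86ISD::FNMSUB above.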
48023
48024/// Do target-specific dag combines on floating point negations.
48025static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
48026 TargetLowering::DAGCombinerInfo &DCI,
48027 const X86Subtarget &Subtarget) {
48028 EVT OrigVT = N->getValueType(0);
48029 SDValue Arg = isFNEG(DAG, N);
48030 if (!Arg)
48031 return SDValue();
48032
48033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48034 EVT VT = Arg.getValueType();
48035 EVT SVT = VT.getScalarType();
48036 SDLoc DL(N);
48037
48038 // Let legalize expand this if it isn't a legal type yet.
48039 if (!TLI.isTypeLegal(VT))
48040 return SDValue();
48041
48042 // If we're negating a FMUL node on a target with FMA, then we can avoid the
48043 // use of a constant by performing (-0 - A*B) instead.
48044 // FIXME: Check rounding control flags as well once it becomes available.
48045 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
48046 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
48047 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
48048 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
48049 Arg.getOperand(1), Zero);
48050 return DAG.getBitcast(OrigVT, NewNode);
48051 }
48052
48053 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
48054 bool LegalOperations = !DCI.isBeforeLegalizeOps();
48055 if (SDValue NegArg =
48056 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
48057 return DAG.getBitcast(OrigVT, NegArg);
48058
48059 return SDValue();
48060}
48061
48062SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
48063 bool LegalOperations,
48064 bool ForCodeSize,
48065 NegatibleCost &Cost,
48066 unsigned Depth) const {
48067 // fneg patterns are removable even if they have multiple uses.
48068 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
48069 Cost = NegatibleCost::Cheaper;
48070 return DAG.getBitcast(Op.getValueType(), Arg);
48071 }
48072
48073 EVT VT = Op.getValueType();
48074 EVT SVT = VT.getScalarType();
48075 unsigned Opc = Op.getOpcode();
48076 SDNodeFlags Flags = Op.getNode()->getFlags();
48077 switch (Opc) {
48078 case ISD::FMA:
48079 case X86ISD::FMSUB:
48080 case X86ISD::FNMADD:
48081 case X86ISD::FNMSUB:
48082 case X86ISD::FMADD_RND:
48083 case X86ISD::FMSUB_RND:
48084 case X86ISD::FNMADD_RND:
48085 case X86ISD::FNMSUB_RND: {
48086 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
48087 !(SVT == MVT::f32 || SVT == MVT::f64) ||
48088 !isOperationLegal(ISD::FMA, VT))
48089 break;
48090
48091 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
48092 // if it may have signed zeros.
48093 if (!Flags.hasNoSignedZeros())
48094 break;
48095
48096 // This is always negatible for free but we might be able to remove some
48097 // extra operand negations as well.
48098 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
48099 for (int i = 0; i != 3; ++i)
48100 NewOps[i] = getCheaperNegatedExpression(
48101 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
48102
48103 bool NegA = !!NewOps[0];
48104 bool NegB = !!NewOps[1];
48105 bool NegC = !!NewOps[2];
48106 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
48107
48108 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
48109 : NegatibleCost::Neutral;
48110
48111 // Fill in the non-negated ops with the original values.
48112 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
48113 if (!NewOps[i])
48114 NewOps[i] = Op.getOperand(i);
48115 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
48116 }
48117 case X86ISD::FRCP:
48118 if (SDValue NegOp0 =
48119 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
48120 ForCodeSize, Cost, Depth + 1))
48121 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
48122 break;
48123 }
48124
48125 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
48126 ForCodeSize, Cost, Depth);
48127}
48128
48129static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
48130 const X86Subtarget &Subtarget) {
48131 MVT VT = N->getSimpleValueType(0);
48132 // If we have integer vector types available, use the integer opcodes.
48133 if (!VT.isVector() || !Subtarget.hasSSE2())
48134 return SDValue();
48135
48136 SDLoc dl(N);
48137
48138 unsigned IntBits = VT.getScalarSizeInBits();
48139 MVT IntSVT = MVT::getIntegerVT(IntBits);
48140 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
48141
48142 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
48143 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
48144 unsigned IntOpcode;
48145 switch (N->getOpcode()) {
48146 default: llvm_unreachable("Unexpected FP logic op");
48147 case X86ISD::FOR: IntOpcode = ISD::OR; break;
48148 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
48149 case X86ISD::FAND: IntOpcode = ISD::AND; break;
48150 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
48151 }
48152 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
48153 return DAG.getBitcast(VT, IntOp);
48154}
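A minimal scalar sketch (not from the original file) of why the rewrite above is sound: FP logic ops only manipulate bit patterns, so after a bitcast they can be performed with the integer opcodes.

// Illustrative only; assumes <cstdint> and <cstring> are available.
static float fpOrViaInt(float A, float B) {
  uint32_t IA, IB;
  std::memcpy(&IA, &A, sizeof(IA));
  std::memcpy(&IB, &B, sizeof(IB));
  uint32_t IR = IA | IB;               // X86ISD::FOR becomes ISD::OR
  float R;
  std::memcpy(&R, &IR, sizeof(R));
  return R;
}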
48155
48156
48157/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
48158static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
48159 if (N->getOpcode() != ISD::XOR)
48160 return SDValue();
48161
48162 SDValue LHS = N->getOperand(0);
48163 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
48164 return SDValue();
48165
48166 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
48167 X86::CondCode(LHS->getConstantOperandVal(0)));
48168 SDLoc DL(N);
48169 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
48170}
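A scalar equivalent of the fold above, for illustration only: XORing a 0/1 compare result with 1 is the same as testing the inverted condition.

static bool xorOneOfSetCC(int X, int Y) {
  return (X == Y) ^ 1;                 // equivalent to (X != Y)
}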
48171
48172static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
48173 TargetLowering::DAGCombinerInfo &DCI,
48174 const X86Subtarget &Subtarget) {
48175 SDValue N0 = N->getOperand(0);
48176 SDValue N1 = N->getOperand(1);
48177 EVT VT = N->getValueType(0);
48178
48179 // If this is SSE1 only convert to FXOR to avoid scalarization.
48180 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
48181 return DAG.getBitcast(MVT::v4i32,
48182 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
48183 DAG.getBitcast(MVT::v4f32, N0),
48184 DAG.getBitcast(MVT::v4f32, N1)));
48185 }
48186
48187 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
48188 return Cmp;
48189
48190 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
48191 return R;
48192
48193 if (DCI.isBeforeLegalizeOps())
48194 return SDValue();
48195
48196 if (SDValue SetCC = foldXor1SetCC(N, DAG))
48197 return SetCC;
48198
48199 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
48200 return RV;
48201
48202 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
48203 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48204 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
48205 N0.getOperand(0).getValueType().isVector() &&
48206 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
48207 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
48208 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
48209 N0.getOperand(0).getValueType()));
48210 }
48211
48212 // Handle AVX512 mask widening.
48213 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
48214 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
48215 VT.getVectorElementType() == MVT::i1 &&
48216 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
48217 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
48218 return DAG.getNode(
48219 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
48220 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
48221 N0.getOperand(2));
48222 }
48223
48224 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
48225 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
48226 // TODO: Under what circumstances could this be performed in DAGCombine?
48227 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
48228 N0.getOperand(0).getOpcode() == N->getOpcode()) {
48229 SDValue TruncExtSrc = N0.getOperand(0);
48230 auto *N1C = dyn_cast<ConstantSDNode>(N1);
48231 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
48232 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
48233 SDLoc DL(N);
48234 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
48235 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
48236 return DAG.getNode(ISD::XOR, DL, VT, LHS,
48237 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
48238 }
48239 }
48240
48241 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
48242 return FPLogic;
48243
48244 return combineFneg(N, DAG, DCI, Subtarget);
48245}
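A small sketch (not from the original file) of the zext form of the constant fold above, shown for an i8-to-i32 zero-extend; the constants 0x5A and 0xF0F0F0F0 are purely illustrative.

// Illustrative only; assumes <cstdint> and <cassert> are available.
static uint32_t xorThroughZext(uint8_t X) {
  // Original form:  xor(zext(xor(x, 0x5A)), 0xF0F0F0F0)
  uint32_t Orig   = uint32_t(uint8_t(X ^ 0x5A)) ^ 0xF0F0F0F0u;
  // Folded form:    xor(zext(x), xor(zext(0x5A), 0xF0F0F0F0))
  uint32_t Folded = uint32_t(X) ^ (uint32_t(0x5A) ^ 0xF0F0F0F0u);
  assert(Orig == Folded && "XOR commutes with the zero-extend");
  return Folded;
}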
48246
48247static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
48248 TargetLowering::DAGCombinerInfo &DCI,
48249 const X86Subtarget &Subtarget) {
48250 EVT VT = N->getValueType(0);
48251 unsigned NumBits = VT.getSizeInBits();
48252
48253 // TODO - Constant Folding.
48254
48255 // Simplify the inputs.
48256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48257 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
48258 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
48259 return SDValue(N, 0);
48260
48261 return SDValue();
48262}
48263
48264static bool isNullFPScalarOrVectorConst(SDValue V) {
48265 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
48266}
48267
48268/// If a value is a scalar FP zero or a vector FP zero (potentially including
48269/// undefined elements), return a zero constant that may be used to fold away
48270/// that value. In the case of a vector, the returned constant will not contain
48271/// undefined elements even if the input parameter does. This makes it suitable
48272/// to be used as a replacement operand with operations (eg, bitwise-and) where
48273/// an undef should not propagate.
48274static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
48275 const X86Subtarget &Subtarget) {
48276 if (!isNullFPScalarOrVectorConst(V))
48277 return SDValue();
48278
48279 if (V.getValueType().isVector())
48280 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
48281
48282 return V;
48283}
48284
48285static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
48286 const X86Subtarget &Subtarget) {
48287 SDValue N0 = N->getOperand(0);
48288 SDValue N1 = N->getOperand(1);
48289 EVT VT = N->getValueType(0);
48290 SDLoc DL(N);
48291
48292 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
48293 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
48294 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
48295 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
48296 return SDValue();
48297
48298 auto isAllOnesConstantFP = [](SDValue V) {
48299 if (V.getSimpleValueType().isVector())
48300 return ISD::isBuildVectorAllOnes(V.getNode());
48301 auto *C = dyn_cast<ConstantFPSDNode>(V);
48302 return C && C->getConstantFPValue()->isAllOnesValue();
48303 };
48304
48305 // fand (fxor X, -1), Y --> fandn X, Y
48306 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
48307 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
48308
48309 // fand X, (fxor Y, -1) --> fandn Y, X
48310 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
48311 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
48312
48313 return SDValue();
48314}
48315
48316/// Do target-specific dag combines on X86ISD::FAND nodes.
48317static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
48318 const X86Subtarget &Subtarget) {
48319 // FAND(0.0, x) -> 0.0
48320 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
48321 return V;
48322
48323 // FAND(x, 0.0) -> 0.0
48324 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
48325 return V;
48326
48327 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
48328 return V;
48329
48330 return lowerX86FPLogicOp(N, DAG, Subtarget);
48331}
48332
48333/// Do target-specific dag combines on X86ISD::FANDN nodes.
48334static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
48335 const X86Subtarget &Subtarget) {
48336 // FANDN(0.0, x) -> x
48337 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
48338 return N->getOperand(1);
48339
48340 // FANDN(x, 0.0) -> 0.0
48341 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
48342 return V;
48343
48344 return lowerX86FPLogicOp(N, DAG, Subtarget);
48345}
48346
48347/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
48348static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
48349 TargetLowering::DAGCombinerInfo &DCI,
48350 const X86Subtarget &Subtarget) {
48351 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
48352
48353 // F[X]OR(0.0, x) -> x
48354 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
48355 return N->getOperand(1);
48356
48357 // F[X]OR(x, 0.0) -> x
48358 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
48359 return N->getOperand(0);
48360
48361 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
48362 return NewVal;
48363
48364 return lowerX86FPLogicOp(N, DAG, Subtarget);
48365}
48366
48367/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
48368static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
48369 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
48370
48371 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
48372 if (!DAG.getTarget().Options.NoNaNsFPMath ||
48373 !DAG.getTarget().Options.NoSignedZerosFPMath)
48374 return SDValue();
48375
48376 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
48377 // into FMINC and FMAXC, which are commutative operations.
48378 unsigned NewOp = 0;
48379 switch (N->getOpcode()) {
48380 default: llvm_unreachable("unknown opcode");
48381 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
48382 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
48383 }
48384
48385 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
48386 N->getOperand(0), N->getOperand(1));
48387}
48388
48389static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
48390 const X86Subtarget &Subtarget) {
48391 if (Subtarget.useSoftFloat())
48392 return SDValue();
48393
48394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48395
48396 EVT VT = N->getValueType(0);
48397 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
48398 (Subtarget.hasSSE2() && VT == MVT::f64) ||
48399 (Subtarget.hasFP16() && VT == MVT::f16) ||
48400 (VT.isVector() && TLI.isTypeLegal(VT))))
48401 return SDValue();
48402
48403 SDValue Op0 = N->getOperand(0);
48404 SDValue Op1 = N->getOperand(1);
48405 SDLoc DL(N);
48406 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
48407
48408 // If we don't have to respect NaN inputs, this is a direct translation to x86
48409 // min/max instructions.
48410 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
48411 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
48412
48413 // If one of the operands is known non-NaN use the native min/max instructions
48414 // with the non-NaN input as second operand.
48415 if (DAG.isKnownNeverNaN(Op1))
48416 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
48417 if (DAG.isKnownNeverNaN(Op0))
48418 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
48419
48420 // If we have to respect NaN inputs, this takes at least 3 instructions.
48421 // Favor a library call when operating on a scalar and minimizing code size.
48422 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
48423 return SDValue();
48424
48425 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
48426 VT);
48427
48428 // There are 4 possibilities involving NaN inputs, and these are the required
48429 // outputs:
48430 // Op1
48431 // Num NaN
48432 // ----------------
48433 // Num | Max | Op0 |
48434 // Op0 ----------------
48435 // NaN | Op1 | NaN |
48436 // ----------------
48437 //
48438 // The SSE FP max/min instructions were not designed for this case, but rather
48439 // to implement:
48440 // Min = Op1 < Op0 ? Op1 : Op0
48441 // Max = Op1 > Op0 ? Op1 : Op0
48442 //
48443 // So they always return Op0 if either input is a NaN. However, we can still
48444 // use those instructions for fmaxnum by selecting away a NaN input.
48445
48446 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
48447 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
48448 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
48449
48450 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
48451 // are NaN, the NaN value of Op1 is the result.
48452 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
48453}
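A scalar sketch (not from the original file) of the sequence just built for FMAXNUM, relying on the SSE rule described above that the second source operand (Op0) is passed through when either input is a NaN.

// Illustrative only; assumes <cmath> is available.
static float fmaxnumRef(float Op0, float Op1) {
  float MinOrMax = (Op1 > Op0) ? Op1 : Op0; // X86ISD::FMAX(Op1, Op0):
                                            // yields Op0 when either is NaN
  bool IsOp0Nan = std::isnan(Op0);          // SETUO on (Op0, Op0)
  return IsOp0Nan ? Op1 : MinOrMax;         // NaN only when both are NaN
}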
48454
48455static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
48456 TargetLowering::DAGCombinerInfo &DCI) {
48457 EVT VT = N->getValueType(0);
48458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48459
48460 APInt KnownUndef, KnownZero;
48461 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
48462 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
48463 KnownZero, DCI))
48464 return SDValue(N, 0);
48465
48466 // Convert a full vector load into vzload when not all bits are needed.
48467 SDValue In = N->getOperand(0);
48468 MVT InVT = In.getSimpleValueType();
48469 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
48470 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
48471 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
48472 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
48473 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
48474 MVT MemVT = MVT::getIntegerVT(NumBits);
48475 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
48476 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
48477 SDLoc dl(N);
48478 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
48479 DAG.getBitcast(InVT, VZLoad));
48480 DCI.CombineTo(N, Convert);
48481 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
48482 DCI.recursivelyDeleteUnusedNodes(LN);
48483 return SDValue(N, 0);
48484 }
48485 }
48486
48487 return SDValue();
48488}
48489
48490static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
48491 TargetLowering::DAGCombinerInfo &DCI) {
48492 bool IsStrict = N->isTargetStrictFPOpcode();
48493 EVT VT = N->getValueType(0);
48494
48495 // Convert a full vector load into vzload when not all bits are needed.
48496 SDValue In = N->getOperand(IsStrict ? 1 : 0);
48497 MVT InVT = In.getSimpleValueType();
48498 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
48499 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
48500 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
48501 LoadSDNode *LN = cast<LoadSDNode>(In);
48502 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
48503 MVT MemVT = MVT::getFloatingPointVT(NumBits);
48504 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
48505 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
48506 SDLoc dl(N);
48507 if (IsStrict) {
48508 SDValue Convert =
48509 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
48510 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
48511 DCI.CombineTo(N, Convert, Convert.getValue(1));
48512 } else {
48513 SDValue Convert =
48514 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
48515 DCI.CombineTo(N, Convert);
48516 }
48517 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
48518 DCI.recursivelyDeleteUnusedNodes(LN);
48519 return SDValue(N, 0);
48520 }
48521 }
48522
48523 return SDValue();
48524}
48525
48526/// Do target-specific dag combines on X86ISD::ANDNP nodes.
48527static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
48528 TargetLowering::DAGCombinerInfo &DCI,
48529 const X86Subtarget &Subtarget) {
48530 MVT VT = N->getSimpleValueType(0);
48531
48532 // ANDNP(0, x) -> x
48533 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
48534 return N->getOperand(1);
48535
48536 // ANDNP(x, 0) -> 0
48537 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
48538 return DAG.getConstant(0, SDLoc(N), VT);
48539
48540 // Turn ANDNP back to AND if input is inverted.
48541 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
48542 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
48543 N->getOperand(1));
48544
48545 // Attempt to recursively combine a bitmask ANDNP with shuffles.
48546 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
48547 SDValue Op(N, 0);
48548 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48549 return Res;
48550 }
48551
48552 return SDValue();
48553}
48554
48555static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
48556 TargetLowering::DAGCombinerInfo &DCI) {
48557 SDValue N1 = N->getOperand(1);
48558
48559 // BT ignores high bits in the bit index operand.
48560 unsigned BitWidth = N1.getValueSizeInBits();
48561 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
48562 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
48563 if (N->getOpcode() != ISD::DELETED_NODE)
48564 DCI.AddToWorklist(N);
48565 return SDValue(N, 0);
48566 }
48567
48568 return SDValue();
48569}
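For illustration (not from the original file): with a register operand, BT only consumes the low log2(width) bits of the bit index, which is why the demanded-bits mask above can drop the high bits.

// Illustrative only; assumes <cstdint> is available.
static bool btRef32(uint32_t Val, uint32_t BitIdx) {
  return (Val >> (BitIdx & 31)) & 1;   // only the low 5 index bits matter
}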
48570
48571static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
48572 TargetLowering::DAGCombinerInfo &DCI) {
48573 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
48574 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
48575
48576 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
48577 APInt KnownUndef, KnownZero;
48578 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48579 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
48580 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
48581 DCI)) {
48582 if (N->getOpcode() != ISD::DELETED_NODE)
48583 DCI.AddToWorklist(N);
48584 return SDValue(N, 0);
48585 }
48586
48587 // Convert a full vector load into vzload when not all bits are needed.
48588 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
48589 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
48590 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
48591 SDLoc dl(N);
48592 if (IsStrict) {
48593 SDValue Convert = DAG.getNode(
48594 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
48595 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
48596 DCI.CombineTo(N, Convert, Convert.getValue(1));
48597 } else {
48598 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
48599 DAG.getBitcast(MVT::v8i16, VZLoad));
48600 DCI.CombineTo(N, Convert);
48601 }
48602
48603 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
48604 DCI.recursivelyDeleteUnusedNodes(LN);
48605 return SDValue(N, 0);
48606 }
48607 }
48608 }
48609
48610 return SDValue();
48611}
48612
48613// Try to combine sext_in_reg of a cmov of constants by extending the constants.
48614static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
48615 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
48616
48617 EVT DstVT = N->getValueType(0);
48618
48619 SDValue N0 = N->getOperand(0);
48620 SDValue N1 = N->getOperand(1);
48621 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
48622
48623 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
48624 return SDValue();
48625
48626 // Look through single use any_extends / truncs.
48627 SDValue IntermediateBitwidthOp;
48628 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
48629 N0.hasOneUse()) {
48630 IntermediateBitwidthOp = N0;
48631 N0 = N0.getOperand(0);
48632 }
48633
48634 // See if we have a single use cmov.
48635 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
48636 return SDValue();
48637
48638 SDValue CMovOp0 = N0.getOperand(0);
48639 SDValue CMovOp1 = N0.getOperand(1);
48640
48641 // Make sure both operands are constants.
48642 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48643 !isa<ConstantSDNode>(CMovOp1.getNode()))
48644 return SDValue();
48645
48646 SDLoc DL(N);
48647
48648 // If we looked through an any_extend/trunc above, apply the same op to the constants.
48649 if (IntermediateBitwidthOp) {
48650 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
48651 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
48652 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
48653 }
48654
48655 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
48656 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
48657
48658 EVT CMovVT = DstVT;
48659 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
48660 if (DstVT == MVT::i16) {
48661 CMovVT = MVT::i32;
48662 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
48663 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
48664 }
48665
48666 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
48667 N0.getOperand(2), N0.getOperand(3));
48668
48669 if (CMovVT != DstVT)
48670 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
48671
48672 return CMov;
48673}
48674
48675static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
48676 const X86Subtarget &Subtarget) {
48677 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
48678
48679 if (SDValue V = combineSextInRegCmov(N, DAG))
48680 return V;
48681
48682 EVT VT = N->getValueType(0);
48683 SDValue N0 = N->getOperand(0);
48684 SDValue N1 = N->getOperand(1);
48685 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
48686 SDLoc dl(N);
48687
48688 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
48689 // both SSE and AVX2 since there is no sign-extended shift right
48690 // operation on a vector with 64-bit elements.
48691 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
48692 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
48693 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
48694 N0.getOpcode() == ISD::SIGN_EXTEND)) {
48695 SDValue N00 = N0.getOperand(0);
48696
48697 // EXTLOAD has a better solution on AVX2,
48698 // it may be replaced with X86ISD::VSEXT node.
48699 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
48700 if (!ISD::isNormalLoad(N00.getNode()))
48701 return SDValue();
48702
48703 // Attempt to promote any comparison mask ops before moving the
48704 // SIGN_EXTEND_INREG in the way.
48705 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
48706 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
48707
48708 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
48709 SDValue Tmp =
48710 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
48711 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
48712 }
48713 }
48714 return SDValue();
48715}
48716
48717/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
48718/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
48719/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
48720/// opportunities to combine math ops, use an LEA, or use a complex addressing
48721/// mode. This can eliminate extend, add, and shift instructions.
48722static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
48723 const X86Subtarget &Subtarget) {
48724 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
48725 Ext->getOpcode() != ISD::ZERO_EXTEND)
48726 return SDValue();
48727
48728 // TODO: This should be valid for other integer types.
48729 EVT VT = Ext->getValueType(0);
48730 if (VT != MVT::i64)
48731 return SDValue();
48732
48733 SDValue Add = Ext->getOperand(0);
48734 if (Add.getOpcode() != ISD::ADD)
48735 return SDValue();
48736
48737 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
48738 bool NSW = Add->getFlags().hasNoSignedWrap();
48739 bool NUW = Add->getFlags().hasNoUnsignedWrap();
48740
48741 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
48742 // into the 'zext'
48743 if ((Sext && !NSW) || (!Sext && !NUW))
48744 return SDValue();
48745
48746 // Having a constant operand to the 'add' ensures that we are not increasing
48747 // the instruction count because the constant is extended for free below.
48748 // A constant operand can also become the displacement field of an LEA.
48749 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
48750 if (!AddOp1)
48751 return SDValue();
48752
48753 // Don't make the 'add' bigger if there's no hope of combining it with some
48754 // other 'add' or 'shl' instruction.
48755 // TODO: It may be profitable to generate simpler LEA instructions in place
48756 // of single 'add' instructions, but the cost model for selecting an LEA
48757 // currently has a high threshold.
48758 bool HasLEAPotential = false;
48759 for (auto *User : Ext->uses()) {
48760 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
48761 HasLEAPotential = true;
48762 break;
48763 }
48764 }
48765 if (!HasLEAPotential)
48766 return SDValue();
48767
48768 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
48769 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
48770 SDValue AddOp0 = Add.getOperand(0);
48771 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
48772 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
48773
48774 // The wider add is guaranteed to not wrap because both operands are
48775 // extended under the matching no-wrap flag (nsw for sext, nuw for zext).
48776 SDNodeFlags Flags;
48777 Flags.setNoSignedWrap(NSW);
48778 Flags.setNoUnsignedWrap(NUW);
48779 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
48780}
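A hedged sketch of the sign-extend case of the transform above; the constant 42 is purely illustrative.

// Illustrative only; assumes <cstdint> is available.
static int64_t promotedExtAdd(int32_t X) {
  // Before: sext64(add nsw (X, 42))
  // After:  add(sext64(X), 42)   -- the constant is extended for free and can
  //                                 become an LEA displacement
  return int64_t(X) + 42;
}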
48781
48782// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
48783// operands and the result of CMOV is not used anywhere else - promote CMOV
48784// itself instead of promoting its result. This could be beneficial, because:
48785// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
48786// (or more) pseudo-CMOVs only when they go one-after-another and
48787// getting rid of result extension code after CMOV will help that.
48788// 2) Promotion of constant CMOV arguments is free, hence the
48789// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
48790// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
48791// promotion is also good in terms of code-size.
48792// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
48793// promotion).
48794static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
48795 SDValue CMovN = Extend->getOperand(0);
48796 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
48797 return SDValue();
48798
48799 EVT TargetVT = Extend->getValueType(0);
48800 unsigned ExtendOpcode = Extend->getOpcode();
48801 SDLoc DL(Extend);
48802
48803 EVT VT = CMovN.getValueType();
48804 SDValue CMovOp0 = CMovN.getOperand(0);
48805 SDValue CMovOp1 = CMovN.getOperand(1);
48806
48807 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
48808 !isa<ConstantSDNode>(CMovOp1.getNode()))
48809 return SDValue();
48810
48811 // Only extend to i32 or i64.
48812 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
48813 return SDValue();
48814
48815 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
48816 // are free.
48817 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
48818 return SDValue();
48819
48820 // If this is a zero extend to i64, we should only extend to i32 and use a free
48821 // zero extend to finish.
48822 EVT ExtendVT = TargetVT;
48823 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
48824 ExtendVT = MVT::i32;
48825
48826 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
48827 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
48828
48829 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
48830 CMovN.getOperand(2), CMovN.getOperand(3));
48831
48832 // Finish extending if needed.
48833 if (ExtendVT != TargetVT)
48834 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
48835
48836 return Res;
48837}
48838
48839// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
48840// This is more or less the reverse of combineBitcastvxi1.
48841static SDValue
48842combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
48843 TargetLowering::DAGCombinerInfo &DCI,
48844 const X86Subtarget &Subtarget) {
48845 unsigned Opcode = N->getOpcode();
48846 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
48847 Opcode != ISD::ANY_EXTEND)
48848 return SDValue();
48849 if (!DCI.isBeforeLegalizeOps())
48850 return SDValue();
48851 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
48852 return SDValue();
48853
48854 SDValue N0 = N->getOperand(0);
48855 EVT VT = N->getValueType(0);
48856 EVT SVT = VT.getScalarType();
48857 EVT InSVT = N0.getValueType().getScalarType();
48858 unsigned EltSizeInBits = SVT.getSizeInBits();
48859
48860 // Input type must be extending a bool vector (bit-casted from a scalar
48861 // integer) to legal integer types.
48862 if (!VT.isVector())
48863 return SDValue();
48864 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
48865 return SDValue();
48866 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
48867 return SDValue();
48868
48869 SDValue N00 = N0.getOperand(0);
48870 EVT SclVT = N0.getOperand(0).getValueType();
48871 if (!SclVT.isScalarInteger())
48872 return SDValue();
48873
48874 SDLoc DL(N);
48875 SDValue Vec;
48876 SmallVector<int, 32> ShuffleMask;
48877 unsigned NumElts = VT.getVectorNumElements();
48878 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
48879
48880 // Broadcast the scalar integer to the vector elements.
48881 if (NumElts > EltSizeInBits) {
48882 // If the scalar integer is greater than the vector element size, then we
48883 // must split it down into sub-sections for broadcasting. For example:
48884 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
48885 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
48886 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
48887 unsigned Scale = NumElts / EltSizeInBits;
48888 EVT BroadcastVT =
48889 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
48890 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48891 Vec = DAG.getBitcast(VT, Vec);
48892
48893 for (unsigned i = 0; i != Scale; ++i)
48894 ShuffleMask.append(EltSizeInBits, i);
48895 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48896 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
48897 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
48898 // If we have register broadcast instructions, use the scalar size as the
48899 // element type for the shuffle. Then cast to the wider element type. The
48900 // widened bits won't be used, and this might allow the use of a broadcast
48901 // load.
48902 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
48903 unsigned Scale = EltSizeInBits / NumElts;
48904 EVT BroadcastVT =
48905 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
48906 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
48907 ShuffleMask.append(NumElts * Scale, 0);
48908 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
48909 Vec = DAG.getBitcast(VT, Vec);
48910 } else {
48911 // For smaller scalar integers, we can simply any-extend it to the vector
48912 // element size (we don't care about the upper bits) and broadcast it to all
48913 // elements.
48914 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
48915 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
48916 ShuffleMask.append(NumElts, 0);
48917 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
48918 }
48919
48920 // Now, mask the relevant bit in each element.
48921 SmallVector<SDValue, 32> Bits;
48922 for (unsigned i = 0; i != NumElts; ++i) {
48923 int BitIdx = (i % EltSizeInBits);
48924 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
48925 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
48926 }
48927 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
48928 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
48929
48930 // Compare against the bitmask and extend the result.
48931 EVT CCVT = VT.changeVectorElementType(MVT::i1);
48932 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
48933 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
48934
48935 // For SEXT, this is now done, otherwise shift the result down for
48936 // zero-extension.
48937 if (Opcode == ISD::SIGN_EXTEND)
48938 return Vec;
48939 return DAG.getNode(ISD::SRL, DL, VT, Vec,
48940 DAG.getConstant(EltSizeInBits - 1, DL, VT));
48941}
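A scalar model (not from the original file) of what the broadcast/mask/compare sequence above computes, shown for an i16 scalar extended to a sign-extended v16i8 result.

// Illustrative only; assumes <cstdint> is available.
static void extendBoolVectorRef(uint16_t Scl, int8_t Out[16]) {
  for (unsigned i = 0; i != 16; ++i) {
    bool Bit = (Scl >> i) & 1;             // broadcast + per-element bit mask
    Out[i] = Bit ? int8_t(-1) : int8_t(0); // SETEQ + sext; zext would give 0/1
  }
}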
48942
48943// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
48944// result type.
48945static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
48946 const X86Subtarget &Subtarget) {
48947 SDValue N0 = N->getOperand(0);
48948 EVT VT = N->getValueType(0);
48949 SDLoc dl(N);
48950
48951 // Only do this combine with AVX512 for vector extends.
48952 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
48953 return SDValue();
48954
48955 // Only combine legal element types.
48956 EVT SVT = VT.getVectorElementType();
48957 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
48958 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
48959 return SDValue();
48960
48961 // We don't have a CMPP instruction for vXf16.
48962 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
48963 return SDValue();
48964 // We can only do this if the vector size is 256 bits or less.
48965 unsigned Size = VT.getSizeInBits();
48966 if (Size > 256 && Subtarget.useAVX512Regs())
48967 return SDValue();
48968
48969 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
48970 // those are the only integer compares we have.
48971 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48972 if (ISD::isUnsignedIntSetCC(CC))
48973 return SDValue();
48974
48975 // Only do this combine if the extension will be fully consumed by the setcc.
48976 EVT N00VT = N0.getOperand(0).getValueType();
48977 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
48978 if (Size != MatchingVecType.getSizeInBits())
48979 return SDValue();
48980
48981 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
48982
48983 if (N->getOpcode() == ISD::ZERO_EXTEND)
48984 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
48985
48986 return Res;
48987}
48988
48989static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
48990 TargetLowering::DAGCombinerInfo &DCI,
48991 const X86Subtarget &Subtarget) {
48992 SDValue N0 = N->getOperand(0);
48993 EVT VT = N->getValueType(0);
48994 SDLoc DL(N);
48995
48996 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
48997 if (!DCI.isBeforeLegalizeOps() &&
48998 N0.getOpcode() == X86ISD::SETCC_CARRY) {
48999 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
49000 N0->getOperand(1));
49001 bool ReplaceOtherUses = !N0.hasOneUse();
49002 DCI.CombineTo(N, Setcc);
49003 // Replace other uses with a truncate of the widened setcc_carry.
49004 if (ReplaceOtherUses) {
49005 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
49006 N0.getValueType(), Setcc);
49007 DCI.CombineTo(N0.getNode(), Trunc);
49008 }
49009
49010 return SDValue(N, 0);
49011 }
49012
49013 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
49014 return NewCMov;
49015
49016 if (!DCI.isBeforeLegalizeOps())
49017 return SDValue();
49018
49019 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
49020 return V;
49021
49022 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
49023 return V;
49024
49025 if (VT.isVector()) {
49026 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
49027 return R;
49028
49029 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
49030 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
49031 }
49032
49033 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
49034 return NewAdd;
49035
49036 return SDValue();
49037}
49038
49039static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
49040 TargetLowering::DAGCombinerInfo &DCI,
49041 const X86Subtarget &Subtarget) {
49042 SDLoc dl(N);
49043 EVT VT = N->getValueType(0);
49044 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
49045
49046 // Let legalize expand this if it isn't a legal type yet.
49047 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49048 if (!TLI.isTypeLegal(VT))
49049 return SDValue();
49050
49051 SDValue A = N->getOperand(IsStrict ? 1 : 0);
49052 SDValue B = N->getOperand(IsStrict ? 2 : 1);
49053 SDValue C = N->getOperand(IsStrict ? 3 : 2);
49054
49055 // If the operation allows fast-math and the target does not support FMA,
49056 // split this into mul+add to avoid libcall(s).
49057 SDNodeFlags Flags = N->getFlags();
49058 if (!IsStrict && Flags.hasAllowReassociation() &&
49059 TLI.isOperationExpand(ISD::FMA, VT)) {
49060 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
49061 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
49062 }
49063
49064 EVT ScalarVT = VT.getScalarType();
49065 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
49066 !Subtarget.hasAnyFMA()) &&
49067 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
49068 return SDValue();
49069
49070 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
49071 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
49072 bool LegalOperations = !DCI.isBeforeLegalizeOps();
49073 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
49074 CodeSize)) {
49075 V = NegV;
49076 return true;
49077 }
49078 // Look through extract_vector_elts. If it comes from an FNEG, create a
49079 // new extract from the FNEG input.
49080 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49081 isNullConstant(V.getOperand(1))) {
49082 SDValue Vec = V.getOperand(0);
49083 if (SDValue NegV = TLI.getCheaperNegatedExpression(
49084 Vec, DAG, LegalOperations, CodeSize)) {
49085 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
49086 NegV, V.getOperand(1));
49087 return true;
49088 }
49089 }
49090
49091 return false;
49092 };
49093
49094 // Do not convert the passthru input of scalar intrinsics.
49095 // FIXME: We could allow negations of the lower element only.
49096 bool NegA = invertIfNegative(A);
49097 bool NegB = invertIfNegative(B);
49098 bool NegC = invertIfNegative(C);
49099
49100 if (!NegA && !NegB && !NegC)
49101 return SDValue();
49102
49103 unsigned NewOpcode =
49104 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
49105
49106 // Propagate fast-math-flags to new FMA node.
49107 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
49108 if (IsStrict) {
49109 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
49110 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
49111 {N->getOperand(0), A, B, C});
49112 } else {
49113 if (N->getNumOperands() == 4)
49114 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
49115 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
49116 }
49117}
49118
49119// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
49120// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
49121static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
49122 TargetLowering::DAGCombinerInfo &DCI) {
49123 SDLoc dl(N);
49124 EVT VT = N->getValueType(0);
49125 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49126 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
49127 bool LegalOperations = !DCI.isBeforeLegalizeOps();
49128
49129 SDValue N2 = N->getOperand(2);
49130
49131 SDValue NegN2 =
49132 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
49133 if (!NegN2)
49134 return SDValue();
49135 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
49136
49137 if (N->getNumOperands() == 4)
49138 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
49139 NegN2, N->getOperand(3));
49140 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
49141 NegN2);
49142}
49143
49144static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
49145 TargetLowering::DAGCombinerInfo &DCI,
49146 const X86Subtarget &Subtarget) {
49147 SDLoc dl(N);
49148 SDValue N0 = N->getOperand(0);
49149 EVT VT = N->getValueType(0);
49150
49151 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
49152 // FIXME: Is this needed? We don't seem to have any tests for it.
49153 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
49154 N0.getOpcode() == X86ISD::SETCC_CARRY) {
49155 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
49156 N0->getOperand(1));
49157 bool ReplaceOtherUses = !N0.hasOneUse();
49158 DCI.CombineTo(N, Setcc);
49159 // Replace other uses with a truncate of the widened setcc_carry.
49160 if (ReplaceOtherUses) {
49161 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
49162 N0.getValueType(), Setcc);
49163 DCI.CombineTo(N0.getNode(), Trunc);
49164 }
49165
49166 return SDValue(N, 0);
49167 }
49168
49169 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
49170 return NewCMov;
49171
49172 if (DCI.isBeforeLegalizeOps())
49173 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
49174 return V;
49175
49176 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
49177 return V;
49178
49179 if (VT.isVector())
49180 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
49181 return R;
49182
49183 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
49184 return NewAdd;
49185
49186 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
49187 return R;
49188
49189 // TODO: Combine with any target/faux shuffle.
49190 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
49191 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
49192 SDValue N00 = N0.getOperand(0);
49193 SDValue N01 = N0.getOperand(1);
49194 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
49195 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
49196 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
49197 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
49198 return concatSubVectors(N00, N01, DAG, dl);
49199 }
49200 }
49201
49202 return SDValue();
49203}
49204
49205/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
49206/// recognizable memcmp expansion.
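/// A matching tree has the shape (or (xor a0, b0), (xor a1, b1)), possibly with
/// further nested ORs, i.e. the chunk-by-chunk form of comparing two wide
/// values for equality against zero.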
49207static bool isOrXorXorTree(SDValue X, bool Root = true) {
49208 if (X.getOpcode() == ISD::OR)
49209 return isOrXorXorTree(X.getOperand(0), false) &&
49210 isOrXorXorTree(X.getOperand(1), false);
49211 if (Root)
49212 return false;
49213 return X.getOpcode() == ISD::XOR;
49214}
49215
49216/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
49217/// expansion.
49218template<typename F>
49219static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
49220 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
49221 SDValue Op0 = X.getOperand(0);
49222 SDValue Op1 = X.getOperand(1);
49223 if (X.getOpcode() == ISD::OR) {
49224 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
49225 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
49226 if (VecVT != CmpVT)
49227 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
49228 if (HasPT)
49229 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
49230 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
49231 } else if (X.getOpcode() == ISD::XOR) {
49232 SDValue A = SToV(Op0);
49233 SDValue B = SToV(Op1);
49234 if (VecVT != CmpVT)
49235 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
49236 if (HasPT)
49237 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
49238 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
49239 }
49240 llvm_unreachable("Impossible");
49241}
49242
49243/// Try to map a 128-bit or larger integer comparison to vector instructions
49244/// before type legalization splits it up into chunks.
49245static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
49246 const X86Subtarget &Subtarget) {
49247 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
49248 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
49249
49250 // We're looking for an oversized integer equality comparison.
49251 SDValue X = SetCC->getOperand(0);
49252 SDValue Y = SetCC->getOperand(1);
49253 EVT OpVT = X.getValueType();
49254 unsigned OpSize = OpVT.getSizeInBits();
49255 if (!OpVT.isScalarInteger() || OpSize < 128)
49256 return SDValue();
49257
49258 // Ignore a comparison with zero because that gets special treatment in
49259 // EmitTest(). But make an exception for the special case of a pair of
49260 // logically-combined vector-sized operands compared to zero. This pattern may
49261 // be generated by the memcmp expansion pass with oversized integer compares
49262 // (see PR33325).
49263 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
49264 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
49265 return SDValue();
49266
49267 // Don't perform this combine if constructing the vector will be expensive.
49268 auto IsVectorBitCastCheap = [](SDValue X) {
49269 X = peekThroughBitcasts(X);
49270 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
49271 X.getOpcode() == ISD::LOAD;
49272 };
49273 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
49274 !IsOrXorXorTreeCCZero)
49275 return SDValue();
49276
49277 EVT VT = SetCC->getValueType(0);
49278 SDLoc DL(SetCC);
49279
49280 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
49281 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
49282 // Otherwise use PCMPEQ (plus AND) and mask testing.
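// For a single 128-bit compare with SSE4.1, for instance, this produces
// (ptest (xor (bitcast X), (bitcast Y))), which sets ZF exactly when every bit
// of X and Y matches.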
49283 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
49284 (OpSize == 256 && Subtarget.hasAVX()) ||
49285 (OpSize == 512 && Subtarget.useAVX512Regs())) {
49286 bool HasPT = Subtarget.hasSSE41();
49287
49288 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
49289 // vector registers are essentially free. (Technically, widening registers
49290 // prevents load folding, but the tradeoff is worth it.)
49291 bool PreferKOT = Subtarget.preferMaskRegisters();
49292 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
49293
49294 EVT VecVT = MVT::v16i8;
49295 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
49296 if (OpSize == 256) {
49297 VecVT = MVT::v32i8;
49298 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
49299 }
49300 EVT CastVT = VecVT;
49301 bool NeedsAVX512FCast = false;
49302 if (OpSize == 512 || NeedZExt) {
49303 if (Subtarget.hasBWI()) {
49304 VecVT = MVT::v64i8;
49305 CmpVT = MVT::v64i1;
49306 if (OpSize == 512)
49307 CastVT = VecVT;
49308 } else {
49309 VecVT = MVT::v16i32;
49310 CmpVT = MVT::v16i1;
49311 CastVT = OpSize == 512 ? VecVT :
49312 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
49313 NeedsAVX512FCast = true;
49314 }
49315 }
49316
49317 auto ScalarToVector = [&](SDValue X) -> SDValue {
49318 bool TmpZext = false;
49319 EVT TmpCastVT = CastVT;
49320 if (X.getOpcode() == ISD::ZERO_EXTEND) {
49321 SDValue OrigX = X.getOperand(0);
49322 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
49323 if (OrigSize < OpSize) {
49324 if (OrigSize == 128) {
49325 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
49326 X = OrigX;
49327 TmpZext = true;
49328 } else if (OrigSize == 256) {
49329 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
49330 X = OrigX;
49331 TmpZext = true;
49332 }
49333 }
49334 }
49335 X = DAG.getBitcast(TmpCastVT, X);
49336 if (!NeedZExt && !TmpZext)
49337 return X;
49338 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
49339 DAG.getConstant(0, DL, VecVT), X,
49340 DAG.getVectorIdxConstant(0, DL));
49341 };
49342
49343 SDValue Cmp;
49344 if (IsOrXorXorTreeCCZero) {
49345 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
49346 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
49347 // Use 2 vector equality compares and 'and' the results before doing a
49348 // MOVMSK.
49349 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
49350 } else {
49351 SDValue VecX = ScalarToVector(X);
49352 SDValue VecY = ScalarToVector(Y);
49353 if (VecVT != CmpVT) {
49354 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
49355 } else if (HasPT) {
49356 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
49357 } else {
49358 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
49359 }
49360 }
49361 // AVX512 should emit a setcc that will lower to kortest.
49362 if (VecVT != CmpVT) {
49363 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
49364 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
49365 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
49366 DAG.getConstant(0, DL, KRegVT), CC);
49367 }
49368 if (HasPT) {
49369 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
49370 Cmp);
49371 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
49372 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
49373 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
49374 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
49375 }
49376 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
49377 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
49378 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
49379 assert(Cmp.getValueType() == MVT::v16i8 &&
49380 "Non 128-bit vector on pre-SSE41 target");
49381 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
49382 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
49383 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
49384 }
49385
49386 return SDValue();
49387}
49388
49389static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
49390 TargetLowering::DAGCombinerInfo &DCI,
49391 const X86Subtarget &Subtarget) {
49392 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
49393 const SDValue LHS = N->getOperand(0);
49394 const SDValue RHS = N->getOperand(1);
49395 EVT VT = N->getValueType(0);
49396 EVT OpVT = LHS.getValueType();
49397 SDLoc DL(N);
49398
49399 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
49400 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
49401 return V;
49402
49403 if (VT == MVT::i1 && isNullConstant(RHS)) {
49404 SDValue X86CC;
49405 if (SDValue V =
49406 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
49407 return DAG.getNode(ISD::TRUNCATE, DL, VT,
49408 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
49409 }
49410
49411 if (OpVT.isScalarInteger()) {
49412 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
49413 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
49414 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
49415 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
49416 if (N0.getOperand(0) == N1)
49417 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
49418 N0.getOperand(1));
49419 if (N0.getOperand(1) == N1)
49420 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
49421 N0.getOperand(0));
49422 }
49423 return SDValue();
49424 };
49425 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
49426 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49427 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
49428 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49429
49430 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
49431 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
49432 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
49433 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
49434 if (N0.getOperand(0) == N1)
49435 return DAG.getNode(ISD::AND, DL, OpVT, N1,
49436 DAG.getNOT(DL, N0.getOperand(1), OpVT));
49437 if (N0.getOperand(1) == N1)
49438 return DAG.getNode(ISD::AND, DL, OpVT, N1,
49439 DAG.getNOT(DL, N0.getOperand(0), OpVT));
49440 }
49441 return SDValue();
49442 };
49443 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
49444 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49445 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
49446 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
49447
49448 // cmpeq(trunc(x),0) --> cmpeq(x,0)
49449 // cmpne(trunc(x),0) --> cmpne(x,0)
49450 // iff x upper bits are zero.
49451 // TODO: Add support for RHS to be truncate as well?
49452 if (LHS.getOpcode() == ISD::TRUNCATE &&
49453 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
49454 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
49455 EVT SrcVT = LHS.getOperand(0).getValueType();
49456 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
49457 OpVT.getScalarSizeInBits());
49458 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49459 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
49460 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
49461 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
49462 DAG.getConstant(0, DL, SrcVT), CC);
49463 }
49464 }
49465 }
49466
49467 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
49468 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
49469 // Using temporaries to avoid messing up operand ordering for later
49470 // transformations if this doesn't work.
49471 SDValue Op0 = LHS;
49472 SDValue Op1 = RHS;
49473 ISD::CondCode TmpCC = CC;
49474 // Put build_vector on the right.
49475 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
49476 std::swap(Op0, Op1);
49477 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
49478 }
49479
49480 bool IsSEXT0 =
49481 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
49482 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
49483 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
49484
49485 if (IsSEXT0 && IsVZero1) {
49486 assert(VT == Op0.getOperand(0).getValueType() &&
49487 "Unexpected operand type");
49488 if (TmpCC == ISD::SETGT)
49489 return DAG.getConstant(0, DL, VT);
49490 if (TmpCC == ISD::SETLE)
49491 return DAG.getConstant(1, DL, VT);
49492 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
49493 return DAG.getNOT(DL, Op0.getOperand(0), VT);
49494
49495 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
49496 "Unexpected condition code!");
49497 return Op0.getOperand(0);
49498 }
49499 }
49500
49501 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
49502 // pre-promote its result type since vXi1 vectors don't get promoted
49503 // during type legalization.
49504 // NOTE: The element count check is to ignore operand types that need to
49505 // go through type promotion to a 128-bit vector.
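// For instance, a v16i1 setcc of two v16i8 inputs is emitted as a v16i8 setcc
// followed by a truncate to v16i1, so the byte compare survives type
// legalization intact.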
49506 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
49507 VT.getVectorElementType() == MVT::i1 &&
49508 (OpVT.getVectorElementType() == MVT::i8 ||
49509 OpVT.getVectorElementType() == MVT::i16)) {
49510 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
49511 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
49512 }
49513
49514 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
49515 // to avoid scalarization via legalization because v4i32 is not a legal type.
49516 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
49517 LHS.getValueType() == MVT::v4f32)
49518 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
49519
49520 return SDValue();
49521}
49522
49523static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
49524 TargetLowering::DAGCombinerInfo &DCI,
49525 const X86Subtarget &Subtarget) {
49526 SDValue Src = N->getOperand(0);
49527 MVT SrcVT = Src.getSimpleValueType();
49528 MVT VT = N->getSimpleValueType(0);
49529 unsigned NumBits = VT.getScalarSizeInBits();
49530 unsigned NumElts = SrcVT.getVectorNumElements();
49531
49532 // Perform constant folding.
49533 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
49534 assert(VT == MVT::i32 && "Unexpected result type");
49535 APInt Imm(32, 0);
49536 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
49537 if (!Src.getOperand(Idx).isUndef() &&
49538 Src.getConstantOperandAPInt(Idx).isNegative())
49539 Imm.setBit(Idx);
49540 }
49541 return DAG.getConstant(Imm, SDLoc(N), VT);
49542 }
49543
49544 // Look through int->fp bitcasts that don't change the element width.
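// e.g. (movmsk (bitcast v4i32 x to v4f32)) --> (movmsk x).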
49545 unsigned EltWidth = SrcVT.getScalarSizeInBits();
49546 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
49547 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
49548 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
49549
49550 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
49551 // with scalar comparisons.
49552 if (SDValue NotSrc = IsNOT(Src, DAG)) {
49553 SDLoc DL(N);
49554 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
49555 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
49556 return DAG.getNode(ISD::XOR, DL, VT,
49557 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
49558 DAG.getConstant(NotMask, DL, VT));
49559 }
49560
49561 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
49562 // results with scalar comparisons.
49563 if (Src.getOpcode() == X86ISD::PCMPGT &&
49564 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
49565 SDLoc DL(N);
49566 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
49567 return DAG.getNode(ISD::XOR, DL, VT,
49568 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
49569 DAG.getConstant(NotMask, DL, VT));
49570 }
49571
49572 // Simplify the inputs.
49573 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49574 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
49575 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
49576 return SDValue(N, 0);
49577
49578 return SDValue();
49579}
49580
49581static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
49582 TargetLowering::DAGCombinerInfo &DCI) {
49583 // With vector masks we only demand the upper bit of the mask.
49584 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
49585 if (Mask.getScalarValueSizeInBits() != 1) {
49586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49587 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
49588 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
49589 if (N->getOpcode() != ISD::DELETED_NODE)
49590 DCI.AddToWorklist(N);
49591 return SDValue(N, 0);
49592 }
49593 }
49594
49595 return SDValue();
49596}
49597
49598static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
49599 SDValue Index, SDValue Base, SDValue Scale,
49600 SelectionDAG &DAG) {
49601 SDLoc DL(GorS);
49602
49603 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
49604 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
49605 Gather->getMask(), Base, Index, Scale } ;
49606 return DAG.getMaskedGather(Gather->getVTList(),
49607 Gather->getMemoryVT(), DL, Ops,
49608 Gather->getMemOperand(),
49609 Gather->getIndexType(),
49610 Gather->getExtensionType());
49611 }
49612 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
49613 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
49614 Scatter->getMask(), Base, Index, Scale };
49615 return DAG.getMaskedScatter(Scatter->getVTList(),
49616 Scatter->getMemoryVT(), DL,
49617 Ops, Scatter->getMemOperand(),
49618 Scatter->getIndexType(),
49619 Scatter->isTruncatingStore());
49620}
49621
49622static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
49623 TargetLowering::DAGCombinerInfo &DCI) {
49624 SDLoc DL(N);
49625 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
49626 SDValue Index = GorS->getIndex();
49627 SDValue Base = GorS->getBasePtr();
49628 SDValue Scale = GorS->getScale();
49629
49630 if (DCI.isBeforeLegalize()) {
49631 unsigned IndexWidth = Index.getScalarValueSizeInBits();
49632
49633 // Shrink constant indices if they are larger than 32-bits.
49634 // Only do this before legalize types since v2i64 could become v2i32.
49635 // FIXME: We could check that the type is legal if we're after legalize
49636 // types, but then we would need to construct test cases where that happens.
49637 // FIXME: We could support more than just constant vectors, but we need to be
49638 // careful with costing. A truncate that can be optimized out would be fine.
49639 // Otherwise we might only want to create a truncate if it avoids a split.
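// For example, a constant v2i64 index of <8, 16> can be truncated to a v2i32
// <8, 16>, so the gather/scatter index vector is half as wide.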
49640 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
49641 if (BV->isConstant() && IndexWidth > 32 &&
49642 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
49643 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
49644 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
49645 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
49646 }
49647 }
49648
49649 // Shrink sign/zero extends whose source is 32 bits or smaller and whose
49650 // result is wider than 32 bits, if there are sufficient sign bits. Only do
49651 // this before type legalization to avoid creating illegal types in the truncate.
49652 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
49653 Index.getOpcode() == ISD::ZERO_EXTEND) &&
49654 IndexWidth > 32 &&
49655 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
49656 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
49657 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
49658 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
49659 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
49660 }
49661 }
49662
49663 if (DCI.isBeforeLegalizeOps()) {
49664 unsigned IndexWidth = Index.getScalarValueSizeInBits();
49665
49666 // Make sure the index is either i32 or i64
49667 if (IndexWidth != 32 && IndexWidth != 64) {
49668 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
49669 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
49670 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
49671 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
49672 }
49673 }
49674
49675 // With vector masks we only demand the upper bit of the mask.
49676 SDValue Mask = GorS->getMask();
49677 if (Mask.getScalarValueSizeInBits() != 1) {
49678 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49679 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
49680 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
49681 if (N->getOpcode() != ISD::DELETED_NODE)
49682 DCI.AddToWorklist(N);
49683 return SDValue(N, 0);
49684 }
49685 }
49686
49687 return SDValue();
49688}
49689
49690// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
49691static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
49692 const X86Subtarget &Subtarget) {
49693 SDLoc DL(N);
49694 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
49695 SDValue EFLAGS = N->getOperand(1);
49696
49697 // Try to simplify the EFLAGS and condition code operands.
49698 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
49699 return getSETCC(CC, Flags, DL, DAG);
49700
49701 return SDValue();
49702}
49703
49704/// Optimize branch condition evaluation.
49705static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
49706 const X86Subtarget &Subtarget) {
49707 SDLoc DL(N);
49708 SDValue EFLAGS = N->getOperand(3);
49709 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
49710
49711 // Try to simplify the EFLAGS and condition code operands.
49712 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
49713 // RAUW them under us.
49714 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
49715 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
49716 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
49717 N->getOperand(1), Cond, Flags);
49718 }
49719
49720 return SDValue();
49721}
49722
49723// TODO: Could we move this to DAGCombine?
49724static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
49725 SelectionDAG &DAG) {
49726 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
49727 // to optimize away operation when it's from a constant.
49728 //
49729 // The general transformation is:
49730 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
49731 // AND(VECTOR_CMP(x,y), constant2)
49732 // constant2 = UNARYOP(constant)
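//
// For instance, (sint_to_fp (and (vector_cmp x, y), <1,1,1,1>)) becomes an
// integer AND of the compare with (bitcast <1.0,1.0,1.0,1.0>), bitcast back to
// the FP type: each lane of the compare is all-ones or zero, so the conversion
// can be folded into the constant.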
49733
49734 // Early exit if this isn't a vector operation, the operand of the
49735 // unary operation isn't a bitwise AND, or if the sizes of the operations
49736 // aren't the same.
49737 EVT VT = N->getValueType(0);
49738 bool IsStrict = N->isStrictFPOpcode();
49739 unsigned NumEltBits = VT.getScalarSizeInBits();
49740 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49741 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
49742 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
49743 VT.getSizeInBits() != Op0.getValueSizeInBits())
49744 return SDValue();
49745
49746 // Now check that the other operand of the AND is a constant. We could
49747 // make the transformation for non-constant splats as well, but it's unclear
49748 // that would be a benefit as it would not eliminate any operations, just
49749 // perform one more step in scalar code before moving to the vector unit.
49750 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
49751 // Bail out if the vector isn't a constant.
49752 if (!BV->isConstant())
49753 return SDValue();
49754
49755 // Everything checks out. Build up the new and improved node.
49756 SDLoc DL(N);
49757 EVT IntVT = BV->getValueType(0);
49758 // Create a new constant of the appropriate type for the transformed
49759 // DAG.
49760 SDValue SourceConst;
49761 if (IsStrict)
49762 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
49763 {N->getOperand(0), SDValue(BV, 0)});
49764 else
49765 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
49766 // The AND node needs bitcasts to/from an integer vector type around it.
49767 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
49768 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
49769 MaskConst);
49770 SDValue Res = DAG.getBitcast(VT, NewAnd);
49771 if (IsStrict)
49772 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
49773 return Res;
49774 }
49775
49776 return SDValue();
49777}
49778
49779/// If we are converting a value to floating-point, try to replace scalar
49780/// truncate of an extracted vector element with a bitcast. This tries to keep
49781/// the sequence on XMM registers rather than moving between vector and GPRs.
49782static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
49783 // TODO: This is currently only used by combineSIntToFP, but it is generalized
49784 // to allow being called by any similar cast opcode.
49785 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
49786 SDValue Trunc = N->getOperand(0);
49787 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
49788 return SDValue();
49789
49790 SDValue ExtElt = Trunc.getOperand(0);
49791 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
49792 !isNullConstant(ExtElt.getOperand(1)))
49793 return SDValue();
49794
49795 EVT TruncVT = Trunc.getValueType();
49796 EVT SrcVT = ExtElt.getValueType();
49797 unsigned DestWidth = TruncVT.getSizeInBits();
49798 unsigned SrcWidth = SrcVT.getSizeInBits();
49799 if (SrcWidth % DestWidth != 0)
49800 return SDValue();
49801
49802 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
49803 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
49804 unsigned VecWidth = SrcVecVT.getSizeInBits();
49805 unsigned NumElts = VecWidth / DestWidth;
49806 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
49807 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
49808 SDLoc DL(N);
49809 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
49810 BitcastVec, ExtElt.getOperand(1));
49811 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
49812}
49813
49814static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
49815 const X86Subtarget &Subtarget) {
49816 bool IsStrict = N->isStrictFPOpcode();
49817 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49818 EVT VT = N->getValueType(0);
49819 EVT InVT = Op0.getValueType();
49820
49821 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
49822 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
49823 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
49824 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
49825 unsigned ScalarSize = InVT.getScalarSizeInBits();
49826 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
49827 return SDValue();
49828 SDLoc dl(N);
49829 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
49830 ScalarSize < 16 ? MVT::i16
49831 : ScalarSize < 32 ? MVT::i32
49832 : MVT::i64,
49833 InVT.getVectorNumElements());
49834 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49835 if (IsStrict)
49836 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
49837 {N->getOperand(0), P});
49838 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
49839 }
49840
49841 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
49842 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
49843 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
49844 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
49845 VT.getScalarType() != MVT::f16) {
49846 SDLoc dl(N);
49847 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49848 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
49849
49850 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
49851 if (IsStrict)
49852 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49853 {N->getOperand(0), P});
49854 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49855 }
49856
49857 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
49858 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
49859 // the optimization here.
49860 if (DAG.SignBitIsZero(Op0)) {
49861 if (IsStrict)
49862 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
49863 {N->getOperand(0), Op0});
49864 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
49865 }
49866
49867 return SDValue();
49868}
49869
49870static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
49871 TargetLowering::DAGCombinerInfo &DCI,
49872 const X86Subtarget &Subtarget) {
49873 // First try to optimize away the conversion entirely when it's
49874 // conditionally from a constant. Vectors only.
49875 bool IsStrict = N->isStrictFPOpcode();
49876 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
49877 return Res;
49878
49879 // Now move on to more general possibilities.
49880 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
49881 EVT VT = N->getValueType(0);
49882 EVT InVT = Op0.getValueType();
49883
49884 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
49885 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
49886 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
49887 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
49888 unsigned ScalarSize = InVT.getScalarSizeInBits();
49889 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
49890 return SDValue();
49891 SDLoc dl(N);
49892 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
49893 ScalarSize < 16 ? MVT::i16
49894 : ScalarSize < 32 ? MVT::i32
49895 : MVT::i64,
49896 InVT.getVectorNumElements());
49897 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
49898 if (IsStrict)
49899 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49900 {N->getOperand(0), P});
49901 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49902 }
49903
49904 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
49905 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
49906 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
49907 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
49908 VT.getScalarType() != MVT::f16) {
49909 SDLoc dl(N);
49910 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
49911 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
49912 if (IsStrict)
49913 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49914 {N->getOperand(0), P});
49915 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
49916 }
49917
49918 // Without AVX512DQ we only support i64 to float scalar conversion. For both
49919 // vectors and scalars, see if we know that the upper bits are all the sign
49920 // bit, in which case we can truncate the input to i32 and convert from that.
49921 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
49922 unsigned BitWidth = InVT.getScalarSizeInBits();
49923 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
49924 if (NumSignBits >= (BitWidth - 31)) {
49925 EVT TruncVT = MVT::i32;
49926 if (InVT.isVector())
49927 TruncVT = InVT.changeVectorElementType(TruncVT);
49928 SDLoc dl(N);
49929 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
49930 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
49931 if (IsStrict)
49932 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
49933 {N->getOperand(0), Trunc});
49934 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
49935 }
49936 // If we're after legalize and the type is v2i32 we need to shuffle and
49937 // use CVTSI2P.
49938 assert(InVT == MVT::v2i64 && "Unexpected VT!");
49939 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
49940 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
49941 { 0, 2, -1, -1 });
49942 if (IsStrict)
49943 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
49944 {N->getOperand(0), Shuf});
49945 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
49946 }
49947 }
49948
49949 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
49950 // a 32-bit target where SSE doesn't support i64->FP operations.
49951 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
49952 Op0.getOpcode() == ISD::LOAD) {
49953 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
49954
49955 // This transformation is not supported if the result type is f16 or f128.
49956 if (VT == MVT::f16 || VT == MVT::f128)
49957 return SDValue();
49958
49959 // If we have AVX512DQ we can use packed conversion instructions unless
49960 // the VT is f80.
49961 if (Subtarget.hasDQI() && VT != MVT::f80)
49962 return SDValue();
49963
49964 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
49965 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
49966 std::pair<SDValue, SDValue> Tmp =
49967 Subtarget.getTargetLowering()->BuildFILD(
49968 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
49969 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
49970 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
49971 return Tmp.first;
49972 }
49973 }
49974
49975 if (IsStrict)
49976 return SDValue();
49977
49978 if (SDValue V = combineToFPTruncExtElt(N, DAG))
49979 return V;
49980
49981 return SDValue();
49982}
49983
49984static bool needCarryOrOverflowFlag(SDValue Flags) {
49985 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
49986
49987 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
49988 UI != UE; ++UI) {
49989 SDNode *User = *UI;
49990
49991 X86::CondCode CC;
49992 switch (User->getOpcode()) {
49993 default:
49994 // Be conservative.
49995 return true;
49996 case X86ISD::SETCC:
49997 case X86ISD::SETCC_CARRY:
49998 CC = (X86::CondCode)User->getConstantOperandVal(0);
49999 break;
50000 case X86ISD::BRCOND:
50001 CC = (X86::CondCode)User->getConstantOperandVal(2);
50002 break;
50003 case X86ISD::CMOV:
50004 CC = (X86::CondCode)User->getConstantOperandVal(2);
50005 break;
50006 }
50007
50008 switch (CC) {
50009 default: break;
50010 case X86::COND_A: case X86::COND_AE:
50011 case X86::COND_B: case X86::COND_BE:
50012 case X86::COND_O: case X86::COND_NO:
50013 case X86::COND_G: case X86::COND_GE:
50014 case X86::COND_L: case X86::COND_LE:
50015 return true;
50016 }
50017 }
50018
50019 return false;
50020}
50021
50022static bool onlyZeroFlagUsed(SDValue Flags) {
50023 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
50024
50025 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
50026 UI != UE; ++UI) {
50027 SDNode *User = *UI;
50028
50029 unsigned CCOpNo;
50030 switch (User->getOpcode()) {
50031 default:
50032 // Be conservative.
50033 return false;
50034 case X86ISD::SETCC: CCOpNo = 0; break;
50035 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
50036 case X86ISD::BRCOND: CCOpNo = 2; break;
50037 case X86ISD::CMOV: CCOpNo = 2; break;
50038 }
50039
50040 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
50041 if (CC != X86::COND_E && CC != X86::COND_NE)
50042 return false;
50043 }
50044
50045 return true;
50046}
50047
50048static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
50049 // Only handle test patterns.
50050 if (!isNullConstant(N->getOperand(1)))
50051 return SDValue();
50052
50053 // If we have a CMP of a truncated binop, see if we can make a smaller binop
50054 // and use its flags directly.
50055 // TODO: Maybe we should try promoting compares that only use the zero flag
50056 // first if we can prove the upper bits with computeKnownBits?
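// For example, the flags of (cmp (i8 (trunc (add x, y))), 0) can come straight
// from an i8 X86ISD::ADD of the truncated operands, provided the truncate and
// add have no other uses and the wide add's carry/overflow flags are not
// consumed.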
50057 SDLoc dl(N);
50058 SDValue Op = N->getOperand(0);
50059 EVT VT = Op.getValueType();
50060
50061 // If we have a constant logical shift that's only used in a comparison
50062 // against zero turn it into an equivalent AND. This allows turning it into
50063 // a TEST instruction later.
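// e.g. (cmp (srl i32 x, 8), 0) becomes (cmp (and x, 0xFFFFFF00), 0).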
50064 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
50065 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
50066 onlyZeroFlagUsed(SDValue(N, 0))) {
50067 unsigned BitWidth = VT.getSizeInBits();
50068 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
50069 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
50070 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
50071 APInt Mask = Op.getOpcode() == ISD::SRL
50072 ? APInt::getHighBitsSet(BitWidth, MaskBits)
50073 : APInt::getLowBitsSet(BitWidth, MaskBits);
50074 if (Mask.isSignedIntN(32)) {
50075 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
50076 DAG.getConstant(Mask, dl, VT));
50077 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
50078 DAG.getConstant(0, dl, VT));
50079 }
50080 }
50081 }
50082
50083 // Look for a truncate.
50084 if (Op.getOpcode() != ISD::TRUNCATE)
50085 return SDValue();
50086
50087 SDValue Trunc = Op;
50088 Op = Op.getOperand(0);
50089
50090 // See if we can compare with zero against the truncation source,
50091 // which should help using the Z flag from many ops. Only do this for
50092 // i32 truncated op to prevent partial-reg compares of promoted ops.
50093 EVT OpVT = Op.getValueType();
50094 APInt UpperBits =
50095 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
50096 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
50097 onlyZeroFlagUsed(SDValue(N, 0))) {
50098 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
50099 DAG.getConstant(0, dl, OpVT));
50100 }
50101
50102 // After this the truncate and arithmetic op must have a single use.
50103 if (!Trunc.hasOneUse() || !Op.hasOneUse())
50104 return SDValue();
50105
50106 unsigned NewOpc;
50107 switch (Op.getOpcode()) {
50108 default: return SDValue();
50109 case ISD::AND:
50110 // Skip 'and' with a constant. We have special handling for 'and' with an
50111 // immediate during isel to generate test instructions.
50112 if (isa<ConstantSDNode>(Op.getOperand(1)))
50113 return SDValue();
50114 NewOpc = X86ISD::AND;
50115 break;
50116 case ISD::OR: NewOpc = X86ISD::OR; break;
50117 case ISD::XOR: NewOpc = X86ISD::XOR; break;
50118 case ISD::ADD:
50119 // If the carry or overflow flag is used, we can't truncate.
50120 if (needCarryOrOverflowFlag(SDValue(N, 0)))
50121 return SDValue();
50122 NewOpc = X86ISD::ADD;
50123 break;
50124 case ISD::SUB:
50125 // If the carry or overflow flag is used, we can't truncate.
50126 if (needCarryOrOverflowFlag(SDValue(N, 0)))
50127 return SDValue();
50128 NewOpc = X86ISD::SUB;
50129 break;
50130 }
50131
50132 // We found an op we can narrow. Truncate its inputs.
50133 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
50134 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
50135
50136 // Use a X86 specific opcode to avoid DAG combine messing with it.
50137 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50138 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
50139
50140 // For AND, keep a CMP so that we can match the test pattern.
50141 if (NewOpc == X86ISD::AND)
50142 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
50143 DAG.getConstant(0, dl, VT));
50144
50145 // Return the flags.
50146 return Op.getValue(1);
50147}
50148
50149static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
50150 TargetLowering::DAGCombinerInfo &DCI) {
50151 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
50152 "Expected X86ISD::ADD or X86ISD::SUB");
50153
50154 SDLoc DL(N);
50155 SDValue LHS = N->getOperand(0);
50156 SDValue RHS = N->getOperand(1);
50157 MVT VT = LHS.getSimpleValueType();
50158 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
50159
50160 // If we don't use the flag result, simplify back to a generic ADD/SUB.
50161 if (!N->hasAnyUseOfValue(1)) {
50162 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
50163 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
50164 }
50165
50166 // Fold any similar generic ADD/SUB opcodes to reuse this node.
50167 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
50168 SDValue Ops[] = {N0, N1};
50169 SDVTList VTs = DAG.getVTList(N->getValueType(0));
50170 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
50171 SDValue Op(N, 0);
50172 if (Negate)
50173 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
50174 DCI.CombineTo(GenericAddSub, Op);
50175 }
50176 };
50177 MatchGeneric(LHS, RHS, false);
50178 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
50179
50180 return SDValue();
50181}
50182
50183static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
50184 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
50185 MVT VT = N->getSimpleValueType(0);
50186 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50187 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
50188 N->getOperand(0), N->getOperand(1),
50189 Flags);
50190 }
50191
50192 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
50193 // iff the flag result is dead.
50194 SDValue Op0 = N->getOperand(0);
50195 SDValue Op1 = N->getOperand(1);
50196 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
50197 !N->hasAnyUseOfValue(1))
50198 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
50199 Op0.getOperand(1), N->getOperand(2));
50200
50201 return SDValue();
50202}
50203
50204// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
50205static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
50206 TargetLowering::DAGCombinerInfo &DCI) {
50207 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
50208 // the result is either zero or one (depending on the input carry bit).
50209 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
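// That is, (adc 0, 0, eflags) computes just the carry bit, so it becomes
// (and (setcc_carry COND_B, eflags), 1) when the flag result is unused.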
50210 if (X86::isZeroNode(N->getOperand(0)) &&
50211 X86::isZeroNode(N->getOperand(1)) &&
50212 // We don't have a good way to replace an EFLAGS use, so only do this when
50213 // dead right now.
50214 SDValue(N, 1).use_empty()) {
50215 SDLoc DL(N);
50216 EVT VT = N->getValueType(0);
50217 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
50218 SDValue Res1 =
50219 DAG.getNode(ISD::AND, DL, VT,
50220 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50221 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50222 N->getOperand(2)),
50223 DAG.getConstant(1, DL, VT));
50224 return DCI.CombineTo(N, Res1, CarryOut);
50225 }
50226
50227 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
50228 MVT VT = N->getSimpleValueType(0);
50229 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50230 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
50231 N->getOperand(0), N->getOperand(1),
50232 Flags);
50233 }
50234
50235 return SDValue();
50236}
50237
50238/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50239/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50240/// with CMP+{ADC, SBB}.
50241static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50242 bool IsSub = N->getOpcode() == ISD::SUB;
50243 SDValue X = N->getOperand(0);
50244 SDValue Y = N->getOperand(1);
50245
50246 // If this is an add, canonicalize a zext operand to the RHS.
50247 // TODO: Incomplete? What if both sides are zexts?
50248 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
50249 Y.getOpcode() != ISD::ZERO_EXTEND)
50250 std::swap(X, Y);
50251
50252 // Look through a one-use zext.
50253 bool PeekedThroughZext = false;
50254 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
50255 Y = Y.getOperand(0);
50256 PeekedThroughZext = true;
50257 }
50258
50259 // If this is an add, canonicalize a setcc operand to the RHS.
50260 // TODO: Incomplete? What if both sides are setcc?
50261 // TODO: Should we allow peeking through a zext of the other operand?
50262 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
50263 Y.getOpcode() != X86ISD::SETCC)
50264 std::swap(X, Y);
50265
50266 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
50267 return SDValue();
50268
50269 SDLoc DL(N);
50270 EVT VT = N->getValueType(0);
50271 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
50272
50273 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50274 // the general case below.
50275 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50276 if (ConstantX) {
50277 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
50278 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
50279 // This is a complicated way to get -1 or 0 from the carry flag:
50280 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50281 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50282 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50283 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50284 Y.getOperand(1));
50285 }
50286
50287 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
50288 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
50289 SDValue EFLAGS = Y->getOperand(1);
50290 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50291 EFLAGS.getValueType().isInteger() &&
50292 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50293 // Swap the operands of a SUB, and we have the same pattern as above.
50294 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50295 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50296 SDValue NewSub = DAG.getNode(
50297 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50298 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50299 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50300 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50301 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50302 NewEFLAGS);
50303 }
50304 }
50305 }
50306
50307 if (CC == X86::COND_B) {
50308 // X + SETB Z --> adc X, 0
50309 // X - SETB Z --> sbb X, 0
50310 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50311 DAG.getVTList(VT, MVT::i32), X,
50312 DAG.getConstant(0, DL, VT), Y.getOperand(1));
50313 }
50314
50315 if (CC == X86::COND_A) {
50316 SDValue EFLAGS = Y.getOperand(1);
50317 // Try to convert COND_A into COND_B in an attempt to facilitate
50318 // materializing "setb reg".
50319 //
50320 // Do not flip "e > c", where "c" is a constant, because the CMP instruction
50321 // cannot take an immediate as its first operand.
50322 //
50323 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50324 EFLAGS.getValueType().isInteger() &&
50325 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50326 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
50327 EFLAGS.getNode()->getVTList(),
50328 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50329 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50330 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50331 DAG.getVTList(VT, MVT::i32), X,
50332 DAG.getConstant(0, DL, VT), NewEFLAGS);
50333 }
50334 }
50335
50336 if (CC == X86::COND_AE) {
50337 // X + SETAE --> sbb X, -1
50338 // X - SETAE --> adc X, -1
50339 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50340 DAG.getVTList(VT, MVT::i32), X,
50341 DAG.getConstant(-1, DL, VT), Y.getOperand(1));
50342 }
50343
50344 if (CC == X86::COND_BE) {
50345 // X + SETBE --> sbb X, -1
50346 // X - SETBE --> adc X, -1
50347 SDValue EFLAGS = Y.getOperand(1);
50348 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50349 // materializing "setae reg".
50350 //
50351 // Do not flip "e <= c", where "c" is a constant, because the CMP instruction
50352 // cannot take an immediate as its first operand.
50353 //
50354 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50355 EFLAGS.getValueType().isInteger() &&
50356 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50357 SDValue NewSub = DAG.getNode(
50358 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50359 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50360 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50361 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50362 DAG.getVTList(VT, MVT::i32), X,
50363 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50364 }
50365 }
50366
50367 if (CC != X86::COND_E && CC != X86::COND_NE)
50368 return SDValue();
50369
50370 SDValue Cmp = Y.getOperand(1);
50371 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
50372 !X86::isZeroNode(Cmp.getOperand(1)) ||
50373 !Cmp.getOperand(0).getValueType().isInteger())
50374 return SDValue();
50375
50376 SDValue Z = Cmp.getOperand(0);
50377 EVT ZVT = Z.getValueType();
50378
50379 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50380 // the general case below.
50381 if (ConstantX) {
50382 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50383 // fake operands:
50384 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50385 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50386 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
50387 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
50388 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50389 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50390 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50391 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50392 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50393 SDValue(Neg.getNode(), 1));
50394 }
50395
50396 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50397 // with fake operands:
50398 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50399 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50400 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
50401 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
50402 SDValue One = DAG.getConstant(1, DL, ZVT);
50403 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50404 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50405 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50406 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50407 Cmp1.getValue(1));
50408 }
50409 }
50410
50411 // (cmp Z, 1) sets the carry flag if Z is 0.
50412 SDValue One = DAG.getConstant(1, DL, ZVT);
50413 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50414 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50415
50416 // Add the flags type for ADC/SBB nodes.
50417 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50418
50419 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50420 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50421 if (CC == X86::COND_NE)
50422 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50423 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50424
50425 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50426 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50427 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50428 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50429}
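
The COND_E/COND_NE tail above leans on a carry-flag identity: "cmp Z, 1" sets CF exactly when Z == 0, and adc/sbb then absorb the setcc into the add/sub. Below is a minimal standalone sketch of that identity (not part of X86ISelLowering.cpp), assuming 32-bit wrap-around arithmetic; the helpers cmpCarry, adc and sbb are hypothetical models written only for this demo, not LLVM APIs.

#include <cassert>
#include <cstdint>

static bool cmpCarry(uint32_t A, uint32_t B) { return A < B; } // CF of "cmp A, B"
static uint32_t adc(uint32_t A, uint32_t B, bool CF) { return A + B + CF; }
static uint32_t sbb(uint32_t A, uint32_t B, bool CF) { return A - B - CF; }

int main() {
  for (uint32_t X : {0u, 1u, 7u, 0xFFFFFFFFu})
    for (uint32_t Z : {0u, 1u, 42u}) {
      bool CF = cmpCarry(Z, 1);                // CF is set exactly when Z == 0
      assert(adc(X, -1u, CF) == X - (Z != 0)); // X - setne(Z, 0) --> adc X, -1
      assert(sbb(X, -1u, CF) == X + (Z != 0)); // X + setne(Z, 0) --> sbb X, -1
      assert(sbb(X, 0u, CF) == X - (Z == 0));  // X - sete(Z, 0)  --> sbb X, 0
      assert(adc(X, 0u, CF) == X + (Z == 0));  // X + sete(Z, 0)  --> adc X, 0
    }
  return 0;
}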
50430
50431static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
50432 const SDLoc &DL, EVT VT,
50433 const X86Subtarget &Subtarget) {
50434 // Example of pattern we try to detect:
50435 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
50436 //(add (build_vector (extract_elt t, 0),
50437 // (extract_elt t, 2),
50438 // (extract_elt t, 4),
50439 // (extract_elt t, 6)),
50440 // (build_vector (extract_elt t, 1),
50441 // (extract_elt t, 3),
50442 // (extract_elt t, 5),
50443 // (extract_elt t, 7)))
50444
50445 if (!Subtarget.hasSSE2())
50446 return SDValue();
50447
50448 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
50449 Op1.getOpcode() != ISD::BUILD_VECTOR)
50450 return SDValue();
50451
50452 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
50453 VT.getVectorNumElements() < 4 ||
50454 !isPowerOf2_32(VT.getVectorNumElements()))
50455 return SDValue();
50456
50457 // Check if one of Op0,Op1 is of the form:
50458 // (build_vector (extract_elt Mul, 0),
50459 // (extract_elt Mul, 2),
50460 // (extract_elt Mul, 4),
50461 // ...
50462 // the other is of the form:
50463 // (build_vector (extract_elt Mul, 1),
50464 // (extract_elt Mul, 3),
50465 // (extract_elt Mul, 5),
50466 // ...
50467 // and identify Mul.
50468 SDValue Mul;
50469 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
50470 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
50471 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
50472 // TODO: Be more tolerant to undefs.
50473 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50474 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50475 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50476 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
50477 return SDValue();
50478 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
50479 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
50480 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
50481 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
50482 if (!Const0L || !Const1L || !Const0H || !Const1H)
50483 return SDValue();
50484 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
50485 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
50486 // Commutativity of mul allows factors of a product to reorder.
50487 if (Idx0L > Idx1L)
50488 std::swap(Idx0L, Idx1L);
50489 if (Idx0H > Idx1H)
50490 std::swap(Idx0H, Idx1H);
50491 // Commutativity of add allows pairs of factors to reorder.
50492 if (Idx0L > Idx0H) {
50493 std::swap(Idx0L, Idx0H);
50494 std::swap(Idx1L, Idx1H);
50495 }
50496 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
50497 Idx1H != 2 * i + 3)
50498 return SDValue();
50499 if (!Mul) {
50500 // First time an extract_elt's source vector is visited. It must be a MUL
50501 // with twice the number of vector elements of the BUILD_VECTOR.
50502 // Both extracts must be from same MUL.
50503 Mul = Op0L->getOperand(0);
50504 if (Mul->getOpcode() != ISD::MUL ||
50505 Mul.getValueType().getVectorNumElements() != 2 * e)
50506 return SDValue();
50507 }
50508 // Check that the extract is from the same MUL previously seen.
50509 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
50510 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
50511 return SDValue();
50512 }
50513
50514 // Check if the Mul source can be safely shrunk.
50515 ShrinkMode Mode;
50516 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
50517 Mode == ShrinkMode::MULU16)
50518 return SDValue();
50519
50520 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50521 VT.getVectorNumElements() * 2);
50522 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
50523 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
50524
50525 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50526 ArrayRef<SDValue> Ops) {
50527 EVT InVT = Ops[0].getValueType();
50528 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
50529 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
50530 InVT.getVectorNumElements() / 2);
50531 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
50532 };
50533 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
50534}
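
For reference, VPMADDWD computes R[i] = A[2i]*B[2i] + A[2i+1]*B[2i+1] on sign-extended i16 elements, which is exactly the even/odd extract-and-add shape matched above. A small standalone sketch of that equivalence; pmaddwdRef is a hypothetical scalar model written for this demo (one 4 x i32 result), not an LLVM API.

#include <array>
#include <cassert>
#include <cstdint>

static std::array<int32_t, 4> pmaddwdRef(const std::array<int16_t, 8> &A,
                                         const std::array<int16_t, 8> &B) {
  std::array<int32_t, 4> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = int32_t(A[2 * i]) * B[2 * i] + int32_t(A[2 * i + 1]) * B[2 * i + 1];
  return R;
}

int main() {
  std::array<int16_t, 8> A{1, -2, 3, -4, 5, -6, 7, -8};
  std::array<int16_t, 8> B{8, 7, -6, 5, 4, -3, 2, 1};
  // The matched DAG: t = mul(sext A, sext B) : 8 x i32, then
  // add(build_vector(t[0], t[2], t[4], t[6]),
  //     build_vector(t[1], t[3], t[5], t[7])).
  std::array<int32_t, 8> T{};
  for (int i = 0; i != 8; ++i)
    T[i] = int32_t(A[i]) * int32_t(B[i]);
  std::array<int32_t, 4> Sum{};
  for (int i = 0; i != 4; ++i)
    Sum[i] = T[2 * i] + T[2 * i + 1];
  assert(Sum == pmaddwdRef(A, B));
  return 0;
}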
50535
50536// Attempt to turn this pattern into PMADDWD.
50537// (add (mul (sext (build_vector)), (sext (build_vector))),
50538// (mul (sext (build_vector)), (sext (build_vector)))
50539static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
50540 const SDLoc &DL, EVT VT,
50541 const X86Subtarget &Subtarget) {
50542 if (!Subtarget.hasSSE2())
50543 return SDValue();
50544
50545 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
50546 return SDValue();
50547
50548 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
50549 VT.getVectorNumElements() < 4 ||
50550 !isPowerOf2_32(VT.getVectorNumElements()))
50551 return SDValue();
50552
50553 SDValue N00 = N0.getOperand(0);
50554 SDValue N01 = N0.getOperand(1);
50555 SDValue N10 = N1.getOperand(0);
50556 SDValue N11 = N1.getOperand(1);
50557
50558 // All inputs need to be sign extends.
50559 // TODO: Support ZERO_EXTEND from known positive?
50560 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
50561 N01.getOpcode() != ISD::SIGN_EXTEND ||
50562 N10.getOpcode() != ISD::SIGN_EXTEND ||
50563 N11.getOpcode() != ISD::SIGN_EXTEND)
50564 return SDValue();
50565
50566 // Peek through the extends.
50567 N00 = N00.getOperand(0);
50568 N01 = N01.getOperand(0);
50569 N10 = N10.getOperand(0);
50570 N11 = N11.getOperand(0);
50571
50572 // Must be extending from vXi16.
50573 EVT InVT = N00.getValueType();
50574 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
50575 N10.getValueType() != InVT || N11.getValueType() != InVT)
50576 return SDValue();
50577
50578 // All inputs should be build_vectors.
50579 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
50580 N01.getOpcode() != ISD::BUILD_VECTOR ||
50581 N10.getOpcode() != ISD::BUILD_VECTOR ||
50582 N11.getOpcode() != ISD::BUILD_VECTOR)
50583 return SDValue();
50584
50585 // For each element, we need to ensure that an odd element from one vector is
50586 // multiplied by the odd element of the other vector, and that the even
50587 // element from the same vector is multiplied by the even element from the
50588 // other vector. So we need to make sure that for each element i, this
50589 // operation is being performed:
50590 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
50591 SDValue In0, In1;
50592 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
50593 SDValue N00Elt = N00.getOperand(i);
50594 SDValue N01Elt = N01.getOperand(i);
50595 SDValue N10Elt = N10.getOperand(i);
50596 SDValue N11Elt = N11.getOperand(i);
50597 // TODO: Be more tolerant to undefs.
50598 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50599 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50600 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
50601 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
50602 return SDValue();
50603 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
50604 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
50605 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
50606 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
50607 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
50608 return SDValue();
50609 unsigned IdxN00 = ConstN00Elt->getZExtValue();
50610 unsigned IdxN01 = ConstN01Elt->getZExtValue();
50611 unsigned IdxN10 = ConstN10Elt->getZExtValue();
50612 unsigned IdxN11 = ConstN11Elt->getZExtValue();
50613 // Add is commutative so indices can be reordered.
50614 if (IdxN00 > IdxN10) {
50615 std::swap(IdxN00, IdxN10);
50616 std::swap(IdxN01, IdxN11);
50617 }
50618 // N0 indices must be the even elements. N1 indices must be the next odd elements.
50619 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
50620 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
50621 return SDValue();
50622 SDValue N00In = N00Elt.getOperand(0);
50623 SDValue N01In = N01Elt.getOperand(0);
50624 SDValue N10In = N10Elt.getOperand(0);
50625 SDValue N11In = N11Elt.getOperand(0);
50626
50627 // First time we find an input capture it.
50628 if (!In0) {
50629 In0 = N00In;
50630 In1 = N01In;
50631
50632 // The input vectors must be at least as wide as the output.
50633 // If they are larger than the output, we extract subvector below.
50634 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
50635 In1.getValueSizeInBits() < VT.getSizeInBits())
50636 return SDValue();
50637 }
50638 // Mul is commutative so the input vectors can be in any order.
50639 // Canonicalize to make the compares easier.
50640 if (In0 != N00In)
50641 std::swap(N00In, N01In);
50642 if (In0 != N10In)
50643 std::swap(N10In, N11In);
50644 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
50645 return SDValue();
50646 }
50647
50648 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50649 ArrayRef<SDValue> Ops) {
50650 EVT OpVT = Ops[0].getValueType();
50651 assert(OpVT.getScalarType() == MVT::i16 &&
50652 "Unexpected scalar element type");
50653 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
50654 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
50655 OpVT.getVectorNumElements() / 2);
50656 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
50657 };
50658
50659 // If the output is narrower than an input, extract the low part of the input
50660 // vector.
50661 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
50662 VT.getVectorNumElements() * 2);
50663 if (OutVT16.bitsLT(In0.getValueType())) {
50664 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
50665 DAG.getIntPtrConstant(0, DL));
50666 }
50667 if (OutVT16.bitsLT(In1.getValueType())) {
50668 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
50669 DAG.getIntPtrConstant(0, DL));
50670 }
50671 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
50672 PMADDBuilder);
50673}
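
matchPMADDWD_2 matches the same reduction, but with the even/odd split applied to the inputs rather than to the widened product: add(mul(sext evens(In0), sext evens(In1)), mul(sext odds(In0), sext odds(In1))), which again equals pmaddwd(In0, In1) lane by lane. A short standalone check of that per-element identity; the values are arbitrary and chosen only for this demo.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int16_t, 8> In0{3, -1, 4, 1, -5, 9, 2, -6};
  std::array<int16_t, 8> In1{2, 7, -1, 8, 2, -8, 1, 8};
  for (int i = 0; i != 4; ++i) {
    int32_t EvenProd = int32_t(In0[2 * i]) * In1[2 * i];
    int32_t OddProd = int32_t(In0[2 * i + 1]) * In1[2 * i + 1];
    int32_t Pmaddwd = int32_t(In0[2 * i]) * In1[2 * i] +
                      int32_t(In0[2 * i + 1]) * In1[2 * i + 1];
    assert(EvenProd + OddProd == Pmaddwd); // both forms give the same i32 lane
  }
  return 0;
}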
50674
50675/// CMOV of constants requires materializing constant operands in registers.
50676/// Try to fold those constants into an 'add' instruction to reduce instruction
50677/// count. We do this with CMOV rather than the generic 'select' because there are
50678/// earlier folds that may be used to turn select-of-constants into logic hacks.
50679static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
50680 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
50681 // better because we eliminate 1-2 instructions. This transform is still
50682 // an improvement without zero operands because we trade 2 move constants and
50683 // 1 add for 2 adds (LEA) as long as the constants can be represented as
50684 // immediate asm operands (fit in 32-bits).
50685 auto isSuitableCmov = [](SDValue V) {
50686 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
50687 return false;
50688 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
50689 !isa<ConstantSDNode>(V.getOperand(1)))
50690 return false;
50691 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
50692 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
50693 V.getConstantOperandAPInt(1).isSignedIntN(32));
50694 };
50695
50696 // Match an appropriate CMOV as the first operand of the add.
50697 SDValue Cmov = N->getOperand(0);
50698 SDValue OtherOp = N->getOperand(1);
50699 if (!isSuitableCmov(Cmov))
50700 std::swap(Cmov, OtherOp);
50701 if (!isSuitableCmov(Cmov))
50702 return SDValue();
50703
50704 EVT VT = N->getValueType(0);
50705 SDLoc DL(N);
50706 SDValue FalseOp = Cmov.getOperand(0);
50707 SDValue TrueOp = Cmov.getOperand(1);
50708
50709 // We will push the add through the select, but we can potentially do better
50710 // if we know there is another add in the sequence and this is pointer math.
50711 // In that case, we can absorb an add into the trailing memory op and avoid
50712 // a 3-operand LEA which is likely slower than a 2-operand LEA.
50713 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
50714 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
50715 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
50716 all_of(N->uses(), [&](SDNode *Use) {
50717 auto *MemNode = dyn_cast<MemSDNode>(Use);
50718 return MemNode && MemNode->getBasePtr().getNode() == N;
50719 })) {
50720 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
50721 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
50722 // it is possible that choosing op1 might be better.
50723 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
50724 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
50725 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
50726 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
50727 Cmov.getOperand(2), Cmov.getOperand(3));
50728 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
50729 }
50730
50731 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
50732 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
50733 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
50734 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
50735 Cmov.getOperand(3));
50736}
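
The fold above is essentially "add distributes over select": once the CMOV arms are constants, the add can be applied to each arm and folded into immediates. A standalone scalar sketch; cmov here is a hypothetical helper that follows the (FalseOp, TrueOp) operand order used above, and the constants are illustrative only.

#include <cassert>
#include <cstdint>

static int64_t cmov(bool Cond, int64_t FalseVal, int64_t TrueVal) {
  return Cond ? TrueVal : FalseVal;
}

int main() {
  const int64_t C1 = 0, C2 = 44; // constants that fit in a 32-bit immediate
  for (bool Cond : {false, true})
    for (int64_t Other : {int64_t(-7), int64_t(123456)}) {
      int64_t Before = cmov(Cond, C1, C2) + Other;        // add (cmov C1, C2), Other
      int64_t After = cmov(Cond, Other + C1, Other + C2); // cmov (add Other, C1), (add Other, C2)
      assert(Before == After);
    }
  return 0;
}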
50737
50738static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
50739 TargetLowering::DAGCombinerInfo &DCI,
50740 const X86Subtarget &Subtarget) {
50741 EVT VT = N->getValueType(0);
50742 SDValue Op0 = N->getOperand(0);
50743 SDValue Op1 = N->getOperand(1);
50744
50745 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG))
50746 return Select;
50747
50748 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
50749 return MAdd;
50750 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
50751 return MAdd;
50752
50753 // Try to synthesize horizontal adds from adds of shuffles.
50754 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50755 return V;
50756
50757 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
50758 // (sub Y, (sext (vXi1 X))).
50759 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
50760 // generic DAG combine without a legal type check, but adding this there
50761 // caused regressions.
50762 if (VT.isVector()) {
50763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50764 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
50765 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50766 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
50767 SDLoc DL(N);
50768 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
50769 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
50770 }
50771
50772 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
50773 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
50774 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
50775 SDLoc DL(N);
50776 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
50777 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
50778 }
50779 }
50780
50781 return combineAddOrSubToADCOrSBB(N, DAG);
50782}
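
The vXi1 rewrite in combineAdd above rests on a per-element identity: for a boolean b, zext(b) is 0 or 1 while sext(b) is 0 or -1, so Y + zext(b) == Y - sext(b). A tiny standalone check of that identity, assuming i32 elements (demo only).

#include <cassert>
#include <cstdint>

int main() {
  for (bool B : {false, true})
    for (int32_t Y : {-5, 0, 100}) {
      int32_t Zext = B ? 1 : 0;
      int32_t Sext = B ? -1 : 0;
      assert(Y + Zext == Y - Sext);
    }
  return 0;
}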
50783
50784static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
50785 TargetLowering::DAGCombinerInfo &DCI,
50786 const X86Subtarget &Subtarget) {
50787 SDValue Op0 = N->getOperand(0);
50788 SDValue Op1 = N->getOperand(1);
50789
50790 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
50791 auto IsNonOpaqueConstant = [&](SDValue Op) {
50792 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
50793 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
50794 return !Cst->isOpaque();
50795 return true;
50796 }
50797 return false;
50798 };
50799
50800 // X86 can't encode an immediate LHS of a sub. See if we can push the
50801 // negation into a preceding instruction. If the RHS of the sub is an XOR with
50802 // one use and a constant, invert the immediate, saving one register.
50803 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
50804 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
50805 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
50806 SDLoc DL(N);
50807 EVT VT = Op0.getValueType();
50808 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
50809 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
50810 SDValue NewAdd =
50811 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
50812 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
50813 }
50814
50815 // Try to synthesize horizontal subs from subs of shuffles.
50816 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
50817 return V;
50818
50819 return combineAddOrSubToADCOrSBB(N, DAG);
50820}
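
The sub/xor fold in combineSub is a two's-complement identity: -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1, so C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1). A standalone check using unsigned 32-bit wrap-around arithmetic; the constants are arbitrary demo values.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0x1234, C2 = 0x00FF00FFu;
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu})
    assert(C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1)); // sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1+1)
  return 0;
}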
50821
50822static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
50823 const X86Subtarget &Subtarget) {
50824 MVT VT = N->getSimpleValueType(0);
50825 SDLoc DL(N);
50826
50827 if (N->getOperand(0) == N->getOperand(1)) {
50828 if (N->getOpcode() == X86ISD::PCMPEQ)
50829 return DAG.getConstant(-1, DL, VT);
50830 if (N->getOpcode() == X86ISD::PCMPGT)
50831 return DAG.getConstant(0, DL, VT);
50832 }
50833
50834 return SDValue();
50835}
50836
50837/// Helper that combines an array of subvector ops as if they were the operands
50838/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
50839/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
50840static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
50841 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
50842 TargetLowering::DAGCombinerInfo &DCI,
50843 const X86Subtarget &Subtarget) {
50844 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
50845 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50846
50847 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
50848 return DAG.getUNDEF(VT);
50849
50850 if (llvm::all_of(Ops, [](SDValue Op) {
50851 return ISD::isBuildVectorAllZeros(Op.getNode());
50852 }))
50853 return getZeroVector(VT, Subtarget, DAG, DL);
50854
50855 SDValue Op0 = Ops[0];
50856 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
50857
50858 // Repeated subvectors.
50859 if (IsSplat &&
50860 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
50861 // If this broadcast is inserted into both halves, use a larger broadcast.
50862 if (Op0.getOpcode() == X86ISD::VBROADCAST)
50863 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
50864
50865 // If this scalar/subvector broadcast_load is inserted into both halves, use
50866 // a larger broadcast_load. Update other uses to use an extracted subvector.
50867 if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
50868 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
50869 auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
50870 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50871 SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
50872 SDValue BcastLd = DAG.getMemIntrinsicNode(Op0.getOpcode(), DL, Tys, Ops,
50873 MemIntr->getMemoryVT(),
50874 MemIntr->getMemOperand());
50875 DAG.ReplaceAllUsesOfValueWith(
50876 Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50877 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
50878 return BcastLd;
50879 }
50880
50881 // If this is a simple subvector load repeated across multiple lanes, then
50882 // broadcast the load. Update other uses to use an extracted subvector.
50883 if (auto *Ld = dyn_cast<LoadSDNode>(Op0)) {
50884 if (Ld->isSimple() && !Ld->isNonTemporal() &&
50885 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
50886 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
50887 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
50888 SDValue BcastLd =
50889 DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops,
50890 Ld->getMemoryVT(), Ld->getMemOperand());
50891 DAG.ReplaceAllUsesOfValueWith(
50892 Op0,
50893 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
50894 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
50895 return BcastLd;
50896 }
50897 }
50898
50899 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
50900 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
50901 (Subtarget.hasAVX2() || MayFoldLoadIntoBroadcastFromMem(
50902 Op0.getOperand(0), VT.getScalarType())))
50903 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
50904 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
50905 Op0.getOperand(0),
50906 DAG.getIntPtrConstant(0, DL)));
50907
50908 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
50909 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50910 (Subtarget.hasAVX2() ||
50911 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
50912 Op0.getOperand(0).getValueType() == VT.getScalarType())
50913 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
50914
50915 // concat_vectors(extract_subvector(broadcast(x)),
50916 // extract_subvector(broadcast(x))) -> broadcast(x)
50917 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50918 Op0.getOperand(0).getValueType() == VT) {
50919 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
50920 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
50921 return Op0.getOperand(0);
50922 }
50923 }
50924
50925 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
50926 // Only handle concats of subvector high halves, which vperm2x128 is best at.
50927 // TODO: This should go in combineX86ShufflesRecursively eventually.
50928 if (VT.is256BitVector() && Ops.size() == 2) {
50929 SDValue Src0 = peekThroughBitcasts(Ops[0]);
50930 SDValue Src1 = peekThroughBitcasts(Ops[1]);
50931 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50932 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
50933 EVT SrcVT0 = Src0.getOperand(0).getValueType();
50934 EVT SrcVT1 = Src1.getOperand(0).getValueType();
50935 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
50936 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
50937 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
50938 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
50939 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
50940 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
50941 DAG.getBitcast(VT, Src0.getOperand(0)),
50942 DAG.getBitcast(VT, Src1.getOperand(0)),
50943 DAG.getTargetConstant(0x31, DL, MVT::i8));
50944 }
50945 }
50946 }
50947
50948 // Repeated opcode.
50949 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
50950 // but it currently struggles with different vector widths.
50951 if (llvm::all_of(Ops, [Op0](SDValue Op) {
50952 return Op.getOpcode() == Op0.getOpcode();
50953 })) {
50954 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
50955 SmallVector<SDValue> Subs;
50956 for (SDValue SubOp : SubOps)
50957 Subs.push_back(SubOp.getOperand(I));
50958 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
50959 };
50960
50961 unsigned NumOps = Ops.size();
50962 switch (Op0.getOpcode()) {
50963 case X86ISD::SHUFP: {
50964 // Add SHUFPD support if/when necessary.
50965 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
50966 llvm::all_of(Ops, [Op0](SDValue Op) {
50967 return Op.getOperand(2) == Op0.getOperand(2);
50968 })) {
50969 return DAG.getNode(Op0.getOpcode(), DL, VT,
50970 ConcatSubOperand(VT, Ops, 0),
50971 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
50972 }
50973 break;
50974 }
50975 case X86ISD::PSHUFHW:
50976 case X86ISD::PSHUFLW:
50977 case X86ISD::PSHUFD:
50978 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
50979 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50980 return DAG.getNode(Op0.getOpcode(), DL, VT,
50981 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
50982 }
50983 LLVM_FALLTHROUGH;
50984 case X86ISD::VPERMILPI:
50985 // TODO - add support for vXf64/vXi64 shuffles.
50986 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
50987 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
50988 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
50989 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
50990 Op0.getOperand(1));
50991 return DAG.getBitcast(VT, Res);
50992 }
50993 break;
50994 case X86ISD::VPERMV3:
50995 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
50996 MVT OpVT = Op0.getSimpleValueType();
50997 int NumSrcElts = OpVT.getVectorNumElements();
50998 SmallVector<int, 64> ConcatMask;
50999 for (unsigned i = 0; i != NumOps; ++i) {
51000 SmallVector<int, 64> SubMask;
51001 SmallVector<SDValue, 2> SubOps;
51002 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
51003 SubMask))
51004 break;
51005 for (int M : SubMask) {
51006 if (0 <= M) {
51007 M += M < NumSrcElts ? 0 : NumSrcElts;
51008 M += i * NumSrcElts;
51009 }
51010 ConcatMask.push_back(M);
51011 }
51012 }
51013 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
51014 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
51015 Ops[1].getOperand(0), DAG, DL);
51016 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
51017 Ops[1].getOperand(2), DAG, DL);
51018 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
51019 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
51020 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
51021 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
51022 }
51023 }
51024 break;
51025 case X86ISD::VSHLI:
51026 case X86ISD::VSRLI:
51027 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
51028 // TODO: Move this to LowerScalarImmediateShift?
51029 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
51030 llvm::all_of(Ops, [](SDValue Op) {
51031 return Op.getConstantOperandAPInt(1) == 32;
51032 })) {
51033 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
51034 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
51035 if (Op0.getOpcode() == X86ISD::VSHLI) {
51036 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
51037 {8, 0, 8, 2, 8, 4, 8, 6});
51038 } else {
51039 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
51040 {1, 8, 3, 8, 5, 8, 7, 8});
51041 }
51042 return DAG.getBitcast(VT, Res);
51043 }
51044 LLVM_FALLTHROUGH;
51045 case X86ISD::VSRAI:
51046 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
51047 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
51048 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
51049 llvm::all_of(Ops, [Op0](SDValue Op) {
51050 return Op0.getOperand(1) == Op.getOperand(1);
51051 })) {
51052 return DAG.getNode(Op0.getOpcode(), DL, VT,
51053 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
51054 }
51055 break;
51056 case X86ISD::VPERMI:
51057 case X86ISD::VROTLI:
51058 case X86ISD::VROTRI:
51059 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
51060 llvm::all_of(Ops, [Op0](SDValue Op) {
51061 return Op0.getOperand(1) == Op.getOperand(1);
51062 })) {
51063 return DAG.getNode(Op0.getOpcode(), DL, VT,
51064 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
51065 }
51066 break;
51067 case ISD::AND:
51068 case ISD::OR:
51069 case ISD::XOR:
51070 case X86ISD::ANDNP:
51071 // TODO: Add 256-bit support.
51072 if (!IsSplat && VT.is512BitVector()) {
51073 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
51074 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
51075 NumOps * SrcVT.getVectorNumElements());
51076 return DAG.getNode(Op0.getOpcode(), DL, VT,
51077 ConcatSubOperand(SrcVT, Ops, 0),
51078 ConcatSubOperand(SrcVT, Ops, 1));
51079 }
51080 break;
51081 case X86ISD::HADD:
51082 case X86ISD::HSUB:
51083 case X86ISD::FHADD:
51084 case X86ISD::FHSUB:
51085 case X86ISD::PACKSS:
51086 case X86ISD::PACKUS:
51087 if (!IsSplat && VT.is256BitVector() &&
51088 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
51089 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
51090 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
51091 NumOps * SrcVT.getVectorNumElements());
51092 return DAG.getNode(Op0.getOpcode(), DL, VT,
51093 ConcatSubOperand(SrcVT, Ops, 0),
51094 ConcatSubOperand(SrcVT, Ops, 1));
51095 }
51096 break;
51097 case X86ISD::PALIGNR:
51098 if (!IsSplat &&
51099 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
51100 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
51101 llvm::all_of(Ops, [Op0](SDValue Op) {
51102 return Op0.getOperand(2) == Op.getOperand(2);
51103 })) {
51104 return DAG.getNode(Op0.getOpcode(), DL, VT,
51105 ConcatSubOperand(VT, Ops, 0),
51106 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
51107 }
51108 break;
51109 }
51110 }
51111
51112 // Fold subvector loads into one.
51113 // If needed, look through bitcasts to get to the load.
51114 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
51115 bool Fast;
51116 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
51117 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51118 *FirstLd->getMemOperand(), &Fast) &&
51119 Fast) {
51120 if (SDValue Ld =
51121 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
51122 return Ld;
51123 }
51124 }
51125
51126 return SDValue();
51127}
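
One way to see the "repeated opcode" concat folds above: per-lane shuffles such as PSHUFD apply the same immediate to every 128-bit lane, so concatenating two 128-bit results that use the same immediate equals one 256-bit shuffle of the concatenated inputs. A standalone model of the PSHUFD case; pshufd128 and pshufd256 are hypothetical reference functions written for this demo, not LLVM APIs.

#include <array>
#include <cassert>
#include <cstdint>

// One 128-bit lane of PSHUFD: pick four dwords by the 2-bit fields of Imm.
static std::array<uint32_t, 4> pshufd128(const std::array<uint32_t, 4> &V,
                                         uint8_t Imm) {
  std::array<uint32_t, 4> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = V[(Imm >> (2 * i)) & 3];
  return R;
}

// 256-bit PSHUFD: the same immediate applied to each 128-bit lane.
static std::array<uint32_t, 8> pshufd256(const std::array<uint32_t, 8> &V,
                                         uint8_t Imm) {
  std::array<uint32_t, 8> R{};
  for (int Lane = 0; Lane != 2; ++Lane)
    for (int i = 0; i != 4; ++i)
      R[Lane * 4 + i] = V[Lane * 4 + ((Imm >> (2 * i)) & 3)];
  return R;
}

int main() {
  const uint8_t Imm = 0x1B; // 0b00011011: reverse the dwords in each lane
  std::array<uint32_t, 4> A{0, 1, 2, 3}, B{4, 5, 6, 7};

  // concat(pshufd(A, Imm), pshufd(B, Imm))
  std::array<uint32_t, 8> Narrow{};
  std::array<uint32_t, 4> RA = pshufd128(A, Imm), RB = pshufd128(B, Imm);
  for (int i = 0; i != 4; ++i) {
    Narrow[i] = RA[i];
    Narrow[i + 4] = RB[i];
  }

  // pshufd(concat(A, B), Imm) yields the same eight dwords.
  std::array<uint32_t, 8> Cat{0, 1, 2, 3, 4, 5, 6, 7};
  assert(pshufd256(Cat, Imm) == Narrow);
  return 0;
}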
51128
51129static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
51130 TargetLowering::DAGCombinerInfo &DCI,
51131 const X86Subtarget &Subtarget) {
51132 EVT VT = N->getValueType(0);
51133 EVT SrcVT = N->getOperand(0).getValueType();
51134 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51135
51136 // Don't do anything for i1 vectors.
51137 if (VT.getVectorElementType() == MVT::i1)
51138 return SDValue();
51139
51140 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
51141 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
51142 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
51143 DCI, Subtarget))
51144 return R;
51145 }
51146
51147 return SDValue();
51148}
51149
51150static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
51151 TargetLowering::DAGCombinerInfo &DCI,
51152 const X86Subtarget &Subtarget) {
51153 if (DCI.isBeforeLegalizeOps())
51154 return SDValue();
51155
51156 MVT OpVT = N->getSimpleValueType(0);
51157
51158 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
51159
51160 SDLoc dl(N);
51161 SDValue Vec = N->getOperand(0);
51162 SDValue SubVec = N->getOperand(1);
51163
51164 uint64_t IdxVal = N->getConstantOperandVal(2);
51165 MVT SubVecVT = SubVec.getSimpleValueType();
51166
51167 if (Vec.isUndef() && SubVec.isUndef())
51168 return DAG.getUNDEF(OpVT);
51169
51170 // Inserting undefs/zeros into zeros/undefs is a zero vector.
51171 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
51172 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
51173 return getZeroVector(OpVT, Subtarget, DAG, dl);
51174
51175 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
51176 // If we're inserting into a zero vector and then into a larger zero vector,
51177 // just insert into the larger zero vector directly.
51178 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
51179 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
51180 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
51181 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
51182 getZeroVector(OpVT, Subtarget, DAG, dl),
51183 SubVec.getOperand(1),
51184 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
51185 }
51186
51187 // If we're inserting into a zero vector and our input was extracted from an
51188 // insert into a zero vector of the same type, and the extraction was at
51189 // least as large as the original insertion, just insert the original
51190 // subvector into a zero vector.
51191 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
51192 isNullConstant(SubVec.getOperand(1)) &&
51193 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
51194 SDValue Ins = SubVec.getOperand(0);
51195 if (isNullConstant(Ins.getOperand(2)) &&
51196 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
51197 Ins.getOperand(1).getValueSizeInBits().getFixedSize() <=
51198 SubVecVT.getFixedSizeInBits())
51199 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
51200 getZeroVector(OpVT, Subtarget, DAG, dl),
51201 Ins.getOperand(1), N->getOperand(2));
51202 }
51203 }
51204
51205 // Stop here if this is an i1 vector.
51206 if (IsI1Vector)
51207 return SDValue();
51208
51209 // If this is an insert of an extract, combine to a shuffle. Don't do this
51210 // if the insert or extract can be represented with a subregister operation.
51211 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51212 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
51213 (IdxVal != 0 ||
51214 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
51215 int ExtIdxVal = SubVec.getConstantOperandVal(1);
51216 if (ExtIdxVal != 0) {
51217 int VecNumElts = OpVT.getVectorNumElements();
51218 int SubVecNumElts = SubVecVT.getVectorNumElements();
51219 SmallVector<int, 64> Mask(VecNumElts);
51220 // First create an identity shuffle mask.
51221 for (int i = 0; i != VecNumElts; ++i)
51222 Mask[i] = i;
51223 // Now insert the extracted portion.
51224 for (int i = 0; i != SubVecNumElts; ++i)
51225 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
51226
51227 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
51228 }
51229 }
51230
51231 // Match concat_vector style patterns.
51232 SmallVector<SDValue, 2> SubVectorOps;
51233 if (collectConcatOps(N, SubVectorOps)) {
51234 if (SDValue Fold =
51235 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
51236 return Fold;
51237
51238 // If we're inserting all zeros into the upper half, change this to
51239 // a concat with zero. We will match this to a move
51240 // with implicit upper bit zeroing during isel.
51241 // We do this here because we don't want combineConcatVectorOps to
51242 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
51243 if (SubVectorOps.size() == 2 &&
51244 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
51245 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
51246 getZeroVector(OpVT, Subtarget, DAG, dl),
51247 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
51248 }
51249
51250 // If this is a broadcast insert into an upper undef, use a larger broadcast.
51251 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
51252 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
51253
51254 // If this is a broadcast load inserted into an upper undef, use a larger
51255 // broadcast load.
51256 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
51257 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
51258 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
51259 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
51260 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
51261 SDValue BcastLd =
51262 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
51263 MemIntr->getMemoryVT(),
51264 MemIntr->getMemOperand());
51265 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
51266 return BcastLd;
51267 }
51268
51269 // If we're splatting the lower half subvector of a full vector load into the
51270 // upper half, attempt to create a subvector broadcast.
51271 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
51272 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
51273 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
51274 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
51275 if (VecLd && SubLd &&
51276 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
51277 SubVec.getValueSizeInBits() / 8, 0))
51278 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
51279 SubLd, 0, DAG);
51280 }
51281
51282 return SDValue();
51283}
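
The insert-of-extract case in combineInsertSubvector builds a two-input shuffle mask: indices below VecNumElts keep elements of Vec, while indices at or above it pull from the extract's source vector. A standalone sketch of that mask construction with small, hypothetical element counts and offsets.

#include <cassert>
#include <vector>

int main() {
  const int VecNumElts = 8, SubVecNumElts = 4;
  const int IdxVal = 4;    // where the subvector is inserted into Vec
  const int ExtIdxVal = 4; // where the subvector was extracted from its source

  std::vector<int> Mask(VecNumElts);
  for (int i = 0; i != VecNumElts; ++i) // identity: keep Vec's own elements
    Mask[i] = i;
  for (int i = 0; i != SubVecNumElts; ++i) // overwrite the inserted region
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

  const std::vector<int> Expected{0, 1, 2, 3, 12, 13, 14, 15};
  assert(Mask == Expected); // 12..15 index into the second shuffle operand
  return 0;
}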
51284
51285/// If we are extracting a subvector of a vector select and the select condition
51286/// is composed of concatenated vectors, try to narrow the select width. This
51287/// is a common pattern for AVX1 integer code because 256-bit selects may be
51288/// legal, but there is almost no integer math/logic available for 256-bit.
51289/// This function should only be called with legal types (otherwise, the calls
51290/// to get simple value types will assert).
51291static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
51292 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
51293 SmallVector<SDValue, 4> CatOps;
51294 if (Sel.getOpcode() != ISD::VSELECT ||
51295 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
51296 return SDValue();
51297
51298 // Note: We assume simple value types because this should only be called with
51299 // legal operations/types.
51300 // TODO: This can be extended to handle extraction to 256-bits.
51301 MVT VT = Ext->getSimpleValueType(0);
51302 if (!VT.is128BitVector())
51303 return SDValue();
51304
51305 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
51306 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
51307 return SDValue();
51308
51309 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
51310 MVT SelVT = Sel.getSimpleValueType();
51311 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
51312 "Unexpected vector type with legal operations");
51313
51314 unsigned SelElts = SelVT.getVectorNumElements();
51315 unsigned CastedElts = WideVT.getVectorNumElements();
51316 unsigned ExtIdx = Ext->getConstantOperandVal(1);
51317 if (SelElts % CastedElts == 0) {
51318 // The select has the same or more (narrower) elements than the extract
51319 // operand. The extraction index gets scaled by that factor.
51320 ExtIdx *= (SelElts / CastedElts);
51321 } else if (CastedElts % SelElts == 0) {
51322 // The select has less (wider) elements than the extract operand. Make sure
51323 // that the extraction index can be divided evenly.
51324 unsigned IndexDivisor = CastedElts / SelElts;
51325 if (ExtIdx % IndexDivisor != 0)
51326 return SDValue();
51327 ExtIdx /= IndexDivisor;
51328 } else {
51329 llvm_unreachable("Element count of simple vector types are not divisible?");
51330 }
51331
51332 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
51333 unsigned NarrowElts = SelElts / NarrowingFactor;
51334 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
51335 SDLoc DL(Ext);
51336 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
51337 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
51338 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
51339 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
51340 return DAG.getBitcast(VT, NarrowSel);
51341}
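
The index rescaling in narrowExtractedVectorSelect only manipulates element counts: when the select has more (narrower) elements than the extract operand the index scales up, and when it has fewer (wider) elements the index must divide evenly and scales down. A standalone sketch with hypothetical element counts (e.g. a v8i32 extract operand over a bitcast v16i16 select, and the reverse).

#include <cassert>

int main() {
  // Select has more (narrower) elements than the extract operand: scale up.
  {
    unsigned SelElts = 16, CastedElts = 8, ExtIdx = 4;
    assert(SelElts % CastedElts == 0);
    ExtIdx *= SelElts / CastedElts;
    assert(ExtIdx == 8);
  }
  // Select has fewer (wider) elements: the index must divide evenly, then
  // it scales down.
  {
    unsigned SelElts = 8, CastedElts = 16, ExtIdx = 8;
    unsigned IndexDivisor = CastedElts / SelElts;
    assert(ExtIdx % IndexDivisor == 0);
    ExtIdx /= IndexDivisor;
    assert(ExtIdx == 4);
  }
  return 0;
}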
51342
51343static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
51344 TargetLowering::DAGCombinerInfo &DCI,
51345 const X86Subtarget &Subtarget) {
51346 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
51347 // eventually get combined/lowered into ANDNP) with a concatenated operand,
51348 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
51349 // We let generic combining take over from there to simplify the
51350 // insert/extract and 'not'.
51351 // This pattern emerges during AVX1 legalization. We handle it before lowering
51352 // to avoid complications like splitting constant vector loads.
51353
51354 // Capture the original wide type in the likely case that we need to bitcast
51355 // back to this type.
51356 if (!N->getValueType(0).isSimple())
51357 return SDValue();
51358
51359 MVT VT = N->getSimpleValueType(0);
51360 SDValue InVec = N->getOperand(0);
51361 unsigned IdxVal = N->getConstantOperandVal(1);
51362 SDValue InVecBC = peekThroughBitcasts(InVec);
51363 EVT InVecVT = InVec.getValueType();
51364 unsigned SizeInBits = VT.getSizeInBits();
51365 unsigned InSizeInBits = InVecVT.getSizeInBits();
51366 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51367
51368 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
51369 TLI.isTypeLegal(InVecVT) &&
51370 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
51371 auto isConcatenatedNot = [](SDValue V) {
51372 V = peekThroughBitcasts(V);
51373 if (!isBitwiseNot(V))
51374 return false;
51375 SDValue NotOp = V->getOperand(0);
51376 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
51377 };
51378 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
51379 isConcatenatedNot(InVecBC.getOperand(1))) {
51380 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
51381 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
51382 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
51383 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
51384 }
51385 }
51386
51387 if (DCI.isBeforeLegalizeOps())
51388 return SDValue();
51389
51390 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
51391 return V;
51392
51393 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
51394 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
51395
51396 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
51397 if (VT.getScalarType() == MVT::i1)
51398 return DAG.getConstant(1, SDLoc(N), VT);
51399 return getOnesVector(VT, DAG, SDLoc(N));
51400 }
51401
51402 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
51403 return DAG.getBuildVector(
51404 VT, SDLoc(N),
51405 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
51406
51407 // If we are extracting from an insert into a zero vector, replace with a
51408 // smaller insert into zero, as long as we access at least as much as the
51409 // original inserted subvector. Don't do this for i1 vectors.
51410 if (VT.getVectorElementType() != MVT::i1 &&
51411 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
51412 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
51413 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
51414 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
51415 SDLoc DL(N);
51416 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
51417 getZeroVector(VT, Subtarget, DAG, DL),
51418 InVec.getOperand(1), InVec.getOperand(2));
51419 }
51420
51421 // If we're extracting an upper subvector from a broadcast, we should just
51422 // extract the lowest subvector instead, which should allow
51423 // SimplifyDemandedVectorElts to do more simplifications.
51424 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
51425 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
51426 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
51427 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
51428
51429 // If we're extracting a broadcasted subvector, just use the lowest subvector.
51430 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51431 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
51432 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
51433
51434 // Attempt to extract from the source of a shuffle vector.
51435 if ((InSizeInBits % SizeInBits) == 0 &&
51436 (IdxVal % VT.getVectorNumElements()) == 0) {
51437 SmallVector<int, 32> ShuffleMask;
51438 SmallVector<int, 32> ScaledMask;
51439 SmallVector<SDValue, 2> ShuffleInputs;
51440 unsigned NumSubVecs = InSizeInBits / SizeInBits;
51441 // Decode the shuffle mask and scale it so it's shuffling subvectors.
51442 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
51443 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
51444 unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
51445 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
51446 return DAG.getUNDEF(VT);
51447 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
51448 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
51449 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
51450 if (Src.getValueSizeInBits() == InSizeInBits) {
51451 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
51452 unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
51453 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
51454 SDLoc(N), SizeInBits);
51455 }
51456 }
51457 }
51458
51459 // If we're extracting the lowest subvector and we're the only user,
51460 // we may be able to perform this with a smaller vector width.
51461 unsigned InOpcode = InVec.getOpcode();
51462 if (IdxVal == 0 && InVec.hasOneUse()) {
51463 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
51464 // v2f64 CVTDQ2PD(v4i32).
51465 if (InOpcode == ISD::SINT_TO_FP &&
51466 InVec.getOperand(0).getValueType() == MVT::v4i32) {
51467 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
51468 }
51469 // v2f64 CVTUDQ2PD(v4i32).
51470 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
51471 InVec.getOperand(0).getValueType() == MVT::v4i32) {
51472 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
51473 }
51474 // v2f64 CVTPS2PD(v4f32).
51475 if (InOpcode == ISD::FP_EXTEND &&
51476 InVec.getOperand(0).getValueType() == MVT::v4f32) {
51477 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
51478 }
51479 }
51480 if ((InOpcode == ISD::ANY_EXTEND ||
51481 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
51482 InOpcode == ISD::ZERO_EXTEND ||
51483 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
51484 InOpcode == ISD::SIGN_EXTEND ||
51485 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
51486 (SizeInBits == 128 || SizeInBits == 256) &&
51487 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
51488 SDLoc DL(N);
51489 SDValue Ext = InVec.getOperand(0);
51490 if (Ext.getValueSizeInBits() > SizeInBits)
51491 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
51492 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
51493 return DAG.getNode(ExtOp, DL, VT, Ext);
51494 }
51495 if (InOpcode == ISD::VSELECT &&
51496 InVec.getOperand(0).getValueType().is256BitVector() &&
51497 InVec.getOperand(1).getValueType().is256BitVector() &&
51498 InVec.getOperand(2).getValueType().is256BitVector()) {
51499 SDLoc DL(N);
51500 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
51501 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
51502 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
51503 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
51504 }
51505 if (InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
51506 (VT.is128BitVector() || VT.is256BitVector())) {
51507 SDLoc DL(N);
51508 SDValue InVecSrc = InVec.getOperand(0);
51509 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
51510 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
51511 return DAG.getNode(InOpcode, DL, VT, Ext);
51512 }
51513 }
51514
51515 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
51516 // as this is very likely to fold into a shuffle/truncation.
51517 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
51518 InVecVT.getScalarSizeInBits() == 64 &&
51519 InVec.getConstantOperandAPInt(1) == 32) {
51520 SDLoc DL(N);
51521 SDValue Ext =
51522 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
51523 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
51524 }
51525
51526 return SDValue();
51527}
51528
51529static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
51530 EVT VT = N->getValueType(0);
51531 SDValue Src = N->getOperand(0);
51532 SDLoc DL(N);
51533
51534 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
51535 // This occurs frequently in our masked scalar intrinsic code and our
51536 // floating point select lowering with AVX512.
51537 // TODO: SimplifyDemandedBits instead?
51538 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
51539 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
51540 if (C->getAPIntValue().isOneValue())
51541 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
51542 Src.getOperand(0));
51543
51544 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
51545 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
51546 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
51547 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
51548 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
51549 if (C->isNullValue())
51550 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
51551 Src.getOperand(1));
51552
51553 // Reduce v2i64 to v4i32 if we don't need the upper bits.
51554 // TODO: Move to DAGCombine/SimplifyDemandedBits?
51555 if (VT == MVT::v2i64 || VT == MVT::v2f64) {
51556 auto IsAnyExt64 = [](SDValue Op) {
51557 if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
51558 return SDValue();
51559 if (Op.getOpcode() == ISD::ANY_EXTEND &&
51560 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
51561 return Op.getOperand(0);
51562 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
51563 if (Ld->getExtensionType() == ISD::EXTLOAD &&
51564 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
51565 return Op;
51566 return SDValue();
51567 };
51568 if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
51569 return DAG.getBitcast(
51570 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
51571 DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
51572 }
51573
51574 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
51575 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
51576 Src.getOperand(0).getValueType() == MVT::x86mmx)
51577 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
51578
51579 // See if we're broadcasting the scalar value, in which case just reuse that.
51580 // Ensure the same SDValue from the SDNode use is being used.
51581 if (VT.getScalarType() == Src.getValueType())
51582 for (SDNode *User : Src->uses())
51583 if (User->getOpcode() == X86ISD::VBROADCAST &&
51584 Src == User->getOperand(0)) {
51585 unsigned SizeInBits = VT.getFixedSizeInBits();
51586 unsigned BroadcastSizeInBits =
51587 User->getValueSizeInBits(0).getFixedSize();
51588 if (BroadcastSizeInBits == SizeInBits)
51589 return SDValue(User, 0);
51590 if (BroadcastSizeInBits > SizeInBits)
51591 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
51592 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
51593 // coverage.
51594 }
51595
51596 return SDValue();
51597}
51598
51599// Simplify PMULDQ and PMULUDQ operations.
51600static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
51601 TargetLowering::DAGCombinerInfo &DCI,
51602 const X86Subtarget &Subtarget) {
51603 SDValue LHS = N->getOperand(0);
51604 SDValue RHS = N->getOperand(1);
51605
51606 // Canonicalize constant to RHS.
51607 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
51608 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
51609 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
51610
51611 // Multiply by zero.
51612 // Don't return RHS as it may contain UNDEFs.
51613 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
51614 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
51615
51616 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
51617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51618 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
51619 return SDValue(N, 0);
51620
51621 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
51622 // convert it to any_extend_invec, due to the LegalOperations check, do the
51623 // conversion directly to a vector shuffle manually. This exposes combine
51624 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
51625 // combineX86ShufflesRecursively on SSE4.1 targets.
51626 // FIXME: This is basically a hack around several other issues related to
51627 // ANY_EXTEND_VECTOR_INREG.
51628 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
51629 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
51630 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
51631 LHS.getOperand(0).getValueType() == MVT::v4i32) {
51632 SDLoc dl(N);
51633 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
51634 LHS.getOperand(0), { 0, -1, 1, -1 });
51635 LHS = DAG.getBitcast(MVT::v2i64, LHS);
51636 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
51637 }
51638 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
51639 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
51640 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
51641 RHS.getOperand(0).getValueType() == MVT::v4i32) {
51642 SDLoc dl(N);
51643 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
51644 RHS.getOperand(0), { 0, -1, 1, -1 });
51645 RHS = DAG.getBitcast(MVT::v2i64, RHS);
51646 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
51647 }
51648
51649 return SDValue();
51650}
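
// Illustrative aside, not part of the LLVM source above: a minimal sketch of
// why the SimplifyDemandedBits call in combinePMULDQ only asks for the low 32
// bits of each 64-bit lane. PMULUDQ (exposed as _mm_mul_epu32, SSE2) reads
// just the low half of every 64-bit element, so whatever computes the high
// halves is dead as far as this node is concerned.
#include <cstdint>
#include <cstdio>
#include <emmintrin.h>

int main() {
  // The high 32 bits of each 64-bit lane are deliberately garbage.
  __m128i A = _mm_set_epi32(0x7EADBEEF, 7, 0x4AFEBABE, 3);
  __m128i B = _mm_set_epi32(0x12345678, 5, 0x07654321, 11);
  __m128i P = _mm_mul_epu32(A, B); // PMULUDQ: {3*11, 7*5} as two u64 results
  uint64_t Lanes[2];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Lanes), P);
  std::printf("%llu %llu\n", (unsigned long long)Lanes[0],
              (unsigned long long)Lanes[1]); // prints "33 35"
  return 0;
}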
51651
51652static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
51653 TargetLowering::DAGCombinerInfo &DCI,
51654 const X86Subtarget &Subtarget) {
51655 EVT VT = N->getValueType(0);
51656 SDValue In = N->getOperand(0);
51657 unsigned Opcode = N->getOpcode();
51658 unsigned InOpcode = In.getOpcode();
51659 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51660
51661 // Try to merge vector loads and extend_inreg to an extload.
51662 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
51663 In.hasOneUse()) {
51664 auto *Ld = cast<LoadSDNode>(In);
51665 if (Ld->isSimple()) {
51666 MVT SVT = In.getSimpleValueType().getVectorElementType();
51667 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
51668 ? ISD::SEXTLOAD
51669 : ISD::ZEXTLOAD;
51670 EVT MemVT = VT.changeVectorElementType(SVT);
51671 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
51672 SDValue Load =
51673 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
51674 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
51675 Ld->getMemOperand()->getFlags());
51676 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
51677 return Load;
51678 }
51679 }
51680 }
51681
51682 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
51683 if (Opcode == InOpcode)
51684 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0));
51685
51686 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
51687 // -> EXTEND_VECTOR_INREG(X).
51688 // TODO: Handle non-zero subvector indices.
51689 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
51690 In.getOperand(0).getOpcode() == getOpcode_EXTEND(Opcode) &&
51691 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
51692 In.getValueSizeInBits())
51693 return DAG.getNode(Opcode, SDLoc(N), VT, In.getOperand(0).getOperand(0));
51694
51695 // Attempt to combine as a shuffle.
51696 // TODO: General ZERO_EXTEND_VECTOR_INREG support.
51697 if (Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
51698 (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && Subtarget.hasSSE41())) {
51699 SDValue Op(N, 0);
51700 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
51701 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51702 return Res;
51703 }
51704
51705 return SDValue();
51706}
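
// Illustrative aside, not part of the LLVM source above: a small sketch of the
// load + extend_vector_inreg pattern that combineEXTEND_VECTOR_INREG tries to
// merge into a single extending load. With SSE4.1 (compile with -msse4.1), the
// separate 64-bit load feeding PMOVSXWD can typically be folded into one
// PMOVSXWD with a memory operand; the exact codegen depends on the target and
// optimization level.
#include <cstdint>
#include <smmintrin.h>

__m128i sext_load_4xi16_to_4xi32(const int16_t *P) {
  // Load 4 x i16 (64 bits) and sign-extend in-register to 4 x i32.
  __m128i V = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(P));
  return _mm_cvtepi16_epi32(V); // PMOVSXWD
}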
51707
51708static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
51709 TargetLowering::DAGCombinerInfo &DCI) {
51710 EVT VT = N->getValueType(0);
51711
51712 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
51713 return DAG.getConstant(0, SDLoc(N), VT);
51714
51715 APInt KnownUndef, KnownZero;
51716 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51717 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
51718 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
51719 KnownZero, DCI))
51720 return SDValue(N, 0);
51721
51722 return SDValue();
51723}
51724
51725// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
51726// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
51727// extra instructions between the conversions due to going to scalar and back.
51728static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
51729 const X86Subtarget &Subtarget) {
51730 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
51731 return SDValue();
51732
51733 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
51734 return SDValue();
51735
51736 if (N->getValueType(0) != MVT::f32 ||
51737 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
51738 return SDValue();
51739
51740 SDLoc dl(N);
51741 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
51742 N->getOperand(0).getOperand(0));
51743 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
51744 DAG.getTargetConstant(4, dl, MVT::i32));
51745 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
51746 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
51747 DAG.getIntPtrConstant(0, dl));
51748}
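
// Illustrative aside, not part of the LLVM source above: the source-level
// shape of the fp16 round trip that combineFP16_TO_FP keeps down to a
// VCVTPS2PH + VCVTPH2PS pair. These F16C intrinsics (compile with -mf16c) are
// just one way such a pattern can arise; _MM_FROUND_CUR_DIRECTION corresponds
// to the rounding-control immediate of 4 used above.
#include <immintrin.h>

float round_trip_through_half(float X) {
  unsigned short H = _cvtss_sh(X, _MM_FROUND_CUR_DIRECTION);
  return _cvtsh_ss(H); // X rounded to half precision and back to float
}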
51749
51750static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
51751 const X86Subtarget &Subtarget) {
51752 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51753 return SDValue();
51754
51755 if (Subtarget.hasFP16())
51756 return SDValue();
51757
51758 bool IsStrict = N->isStrictFPOpcode();
51759 EVT VT = N->getValueType(0);
51760 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
51761 EVT SrcVT = Src.getValueType();
51762
51763 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
51764 return SDValue();
51765
51766 if (VT.getVectorElementType() != MVT::f32 &&
51767 VT.getVectorElementType() != MVT::f64)
51768 return SDValue();
51769
51770 unsigned NumElts = VT.getVectorNumElements();
51771 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51772 return SDValue();
51773
51774 SDLoc dl(N);
51775
51776 // Convert the input to vXi16.
51777 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
51778 Src = DAG.getBitcast(IntVT, Src);
51779
51780 // Widen to at least 8 input elements.
51781 if (NumElts < 8) {
51782 unsigned NumConcats = 8 / NumElts;
51783 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
51784 : DAG.getConstant(0, dl, IntVT);
51785 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
51786 Ops[0] = Src;
51787 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
51788 }
51789
51790 // Destination is vXf32 with at least 4 elements.
51791 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
51792 std::max(4U, NumElts));
51793 SDValue Cvt, Chain;
51794 if (IsStrict) {
51795 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
51796 {N->getOperand(0), Src});
51797 Chain = Cvt.getValue(1);
51798 } else {
51799 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
51800 }
51801
51802 if (NumElts < 4) {
51803 assert(NumElts == 2 && "Unexpected size");
51804 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
51805 DAG.getIntPtrConstant(0, dl));
51806 }
51807
51808 if (IsStrict) {
51809 // Extend to the original VT if necessary.
51810 if (Cvt.getValueType() != VT) {
51811 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
51812 {Chain, Cvt});
51813 Chain = Cvt.getValue(1);
51814 }
51815 return DAG.getMergeValues({Cvt, Chain}, dl);
51816 }
51817
51818 // Extend to the original VT if necessary.
51819 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
51820}
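
// Illustrative aside, not part of the LLVM source above: the kind of vector
// f16 -> f32 extension combineFP_EXTEND lowers through VCVTPH2PS when only
// F16C (and not AVX512 FP16) is available. A minimal sketch using the
// corresponding intrinsic (compile with -mf16c); widening of odd-sized input
// vectors is handled by the code above.
#include <cstdint>
#include <immintrin.h>

__m128 extend_4xf16_to_4xf32(const uint16_t *Halves) {
  // The low 64 bits hold four IEEE half-precision values; the rest is ignored.
  __m128i H = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(Halves));
  return _mm_cvtph_ps(H); // VCVTPH2PS
}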
51821
51822// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
51823// from. Limit this to cases where the loads have the same input chain and the
51824// output chains are unused. This avoids any memory ordering issues.
51825static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
51826 TargetLowering::DAGCombinerInfo &DCI) {
51827 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
51828 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
51829 "Unknown broadcast load type");
51830
51831 // Only do this if the chain result is unused.
51832 if (N->hasAnyUseOfValue(1))
51833 return SDValue();
51834
51835 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
51836
51837 SDValue Ptr = MemIntrin->getBasePtr();
51838 SDValue Chain = MemIntrin->getChain();
51839 EVT VT = N->getSimpleValueType(0);
51840 EVT MemVT = MemIntrin->getMemoryVT();
51841
51842 // Look at other users of our base pointer and try to find a wider broadcast.
51843 // The input chain and the size of the memory VT must match.
51844 for (SDNode *User : Ptr->uses())
51845 if (User != N && User->getOpcode() == N->getOpcode() &&
51846 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51847 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51848 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51849 MemVT.getSizeInBits() &&
51850 !User->hasAnyUseOfValue(1) &&
51851 User->getValueSizeInBits(0).getFixedSize() > VT.getFixedSizeInBits()) {
51852 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51853 VT.getSizeInBits());
51854 Extract = DAG.getBitcast(VT, Extract);
51855 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51856 }
51857
51858 return SDValue();
51859}
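
// Illustrative aside, not part of the LLVM source above: a sketch of the
// situation combineBROADCAST_LOAD targets -- two broadcast loads through the
// same pointer and chain, where the narrower result can be re-derived as the
// low subvector of the wider one instead of loading twice. Whether the fold
// actually fires depends on how the DAG is built for a given input; this is
// only the source-level shape (compile with -mavx).
#include <immintrin.h>

void broadcast_twice(const float *P, __m128 *Out128, __m256 *Out256) {
  *Out256 = _mm256_broadcast_ss(P); // 256-bit broadcast load
  *Out128 = _mm_broadcast_ss(P);    // narrower broadcast of the same value
}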
51860
51861static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
51862 const X86Subtarget &Subtarget) {
51863 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
51864 return SDValue();
51865
51866 if (Subtarget.hasFP16())
51867 return SDValue();
51868
51869 EVT VT = N->getValueType(0);
51870 SDValue Src = N->getOperand(0);
51871 EVT SrcVT = Src.getValueType();
51872
51873 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
51874 SrcVT.getVectorElementType() != MVT::f32)
51875 return SDValue();
51876
51877 unsigned NumElts = VT.getVectorNumElements();
51878 if (NumElts == 1 || !isPowerOf2_32(NumElts))
51879 return SDValue();
51880
51881 SDLoc dl(N);
51882
51883 // Widen to at least 4 input elements.
51884 if (NumElts < 4)
51885 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
51886 DAG.getConstantFP(0.0, dl, SrcVT));
51887
51888 // Destination is v8i16 with at least 8 elements.
51889 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51890 std::max(8U, NumElts));
51891 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
51892 DAG.getTargetConstant(4, dl, MVT::i32));
51893
51894 // Extract down to real number of elements.
51895 if (NumElts < 8) {
51896 EVT IntVT = VT.changeVectorElementTypeToInteger();
51897 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
51898 DAG.getIntPtrConstant(0, dl));
51899 }
51900
51901 return DAG.getBitcast(VT, Cvt);
51902}
51903
51904static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
51905 SDValue Src = N->getOperand(0);
51906
51907 // Turn MOVDQ2Q+simple_load into an mmx load.
51908 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
51909 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
51910
51911 if (LN->isSimple()) {
51912 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
51913 LN->getBasePtr(),
51914 LN->getPointerInfo(),
51915 LN->getOriginalAlign(),
51916 LN->getMemOperand()->getFlags());
51917 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
51918 return NewLd;
51919 }
51920 }
51921
51922 return SDValue();
51923}
51924
51925static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
51926 TargetLowering::DAGCombinerInfo &DCI) {
51927 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
51928 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51929 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
51930 APInt::getAllOnesValue(NumBits), DCI))
51931 return SDValue(N, 0);
51932
51933 return SDValue();
51934}
51935
51936SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
51937 DAGCombinerInfo &DCI) const {
51938 SelectionDAG &DAG = DCI.DAG;
51939 switch (N->getOpcode()) {
51940 default: break;
51941 case ISD::SCALAR_TO_VECTOR:
51942 return combineScalarToVector(N, DAG);
51943 case ISD::EXTRACT_VECTOR_ELT:
51944 case X86ISD::PEXTRW:
51945 case X86ISD::PEXTRB:
51946 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
51947 case ISD::CONCAT_VECTORS:
51948 return combineConcatVectors(N, DAG, DCI, Subtarget);
51949 case ISD::INSERT_SUBVECTOR:
51950 return combineInsertSubvector(N, DAG, DCI, Subtarget);
51951 case ISD::EXTRACT_SUBVECTOR:
51952 return combineExtractSubvector(N, DAG, DCI, Subtarget);
51953 case ISD::VSELECT:
51954 case ISD::SELECT:
51955 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
51956 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
51957 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
51958 case X86ISD::CMP: return combineCMP(N, DAG);
51959 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
51960 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
51961 case X86ISD::ADD:
51962 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
51963 case X86ISD::SBB: return combineSBB(N, DAG);
51964 case X86ISD::ADC: return combineADC(N, DAG, DCI);
51965 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
51966 case ISD::SHL: return combineShiftLeft(N, DAG);
51967 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
51968 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
51969 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
51970 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
51971 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
51972 case X86ISD::BEXTR:
51973 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
51974 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
51975 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
51976 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
51977 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
51978 case X86ISD::VEXTRACT_STORE:
51979 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
51980 case ISD::SINT_TO_FP:
51981 case ISD::STRICT_SINT_TO_FP:
51982 return combineSIntToFP(N, DAG, DCI, Subtarget);
51983 case ISD::UINT_TO_FP:
51984 case ISD::STRICT_UINT_TO_FP:
51985 return combineUIntToFP(N, DAG, Subtarget);
51986 case ISD::FADD:
51987 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
51988 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
51989 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
51990 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
51991 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
51992 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
51993 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
51994 case X86ISD::FXOR:
51995 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
51996 case X86ISD::FMIN:
51997 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
51998 case ISD::FMINNUM:
51999 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
52000 case X86ISD::CVTSI2P:
52001 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
52002 case X86ISD::CVTP2SI:
52003 case X86ISD::CVTP2UI:
52004 case X86ISD::STRICT_CVTTP2SI:
52005 case X86ISD::CVTTP2SI:
52006 case X86ISD::STRICT_CVTTP2UI:
52007 case X86ISD::CVTTP2UI:
52008 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
52009 case X86ISD::STRICT_CVTPH2PS:
52010 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
52011 case X86ISD::BT: return combineBT(N, DAG, DCI);
52012 case ISD::ANY_EXTEND:
52013 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
52014 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
52015 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
52016 case ISD::ANY_EXTEND_VECTOR_INREG:
52017 case ISD::SIGN_EXTEND_VECTOR_INREG:
52018 case ISD::ZERO_EXTEND_VECTOR_INREG:
52019 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
52020 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
52021 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
52022 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
52023 case X86ISD::PACKSS:
52024 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
52025 case X86ISD::HADD:
52026 case X86ISD::HSUB:
52027 case X86ISD::FHADD:
52028 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
52029 case X86ISD::VSHL:
52030 case X86ISD::VSRA:
52031 case X86ISD::VSRL:
52032 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
52033 case X86ISD::VSHLI:
52034 case X86ISD::VSRAI:
52035 case X86ISD::VSRLI:
52036 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
52037 case ISD::INSERT_VECTOR_ELT:
52038 case X86ISD::PINSRB:
52039 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
52040 case X86ISD::SHUFP: // Handle all target specific shuffles
52041 case X86ISD::INSERTPS:
52042 case X86ISD::EXTRQI:
52043 case X86ISD::INSERTQI:
52044 case X86ISD::VALIGN:
52045 case X86ISD::PALIGNR:
52046 case X86ISD::VSHLDQ:
52047 case X86ISD::VSRLDQ:
52048 case X86ISD::BLENDI:
52049 case X86ISD::UNPCKH:
52050 case X86ISD::UNPCKL:
52051 case X86ISD::MOVHLPS:
52052 case X86ISD::MOVLHPS:
52053 case X86ISD::PSHUFB:
52054 case X86ISD::PSHUFD:
52055 case X86ISD::PSHUFHW:
52056 case X86ISD::PSHUFLW:
52057 case X86ISD::MOVSHDUP:
52058 case X86ISD::MOVSLDUP:
52059 case X86ISD::MOVDDUP:
52060 case X86ISD::MOVSS:
52061 case X86ISD::MOVSD:
52062 case X86ISD::MOVSH:
52063 case X86ISD::VBROADCAST:
52064 case X86ISD::VPPERM:
52065 case X86ISD::VPERMI:
52066 case X86ISD::VPERMV:
52067 case X86ISD::VPERMV3:
52068 case X86ISD::VPERMIL2:
52069 case X86ISD::VPERMILPI:
52070 case X86ISD::VPERMILPV:
52071 case X86ISD::VPERM2X128:
52072 case X86ISD::SHUF128:
52073 case X86ISD::VZEXT_MOVL:
52074 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
52075 case X86ISD::FMADD_RND:
52076 case X86ISD::FMSUB:
52077 case X86ISD::STRICT_FMSUB:
52078 case X86ISD::FMSUB_RND:
52079 case X86ISD::FNMADD:
52080 case X86ISD::STRICT_FNMADD:
52081 case X86ISD::FNMADD_RND:
52082 case X86ISD::FNMSUB:
52083 case X86ISD::STRICT_FNMSUB:
52084 case X86ISD::FNMSUB_RND:
52085 case ISD::FMA:
52086 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
52087 case X86ISD::FMADDSUB_RND:
52088 case X86ISD::FMSUBADD_RND:
52089 case X86ISD::FMADDSUB:
52090 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
52091 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
52092 case X86ISD::MGATHER:
52093 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
52094 case ISD::MGATHER:
52095 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
52096 case X86ISD::PCMPEQ:
52097 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
52098 case X86ISD::PMULDQ:
52099 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
52100 case X86ISD::KSHIFTL:
52101 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
52102 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
52103 case ISD::STRICT_FP_EXTEND:
52104 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
52105 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
52106 case X86ISD::VBROADCAST_LOAD:
52107 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
52108 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
52109 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
52110 }
52111
52112 return SDValue();
52113}
52114
52115bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
52116 if (!isTypeLegal(VT))
52117 return false;
52118
52119 // There are no vXi8 shifts.
52120 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
52121 return false;
52122
52123 // TODO: Almost no 8-bit ops are desirable because they have no actual
52124 // size/speed advantages vs. 32-bit ops, but they do have a major
52125 // potential disadvantage by causing partial register stalls.
52126 //
52127 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
52128 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
52129 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
52130 // check for a constant operand to the multiply.
52131 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
52132 return false;
52133
52134 // i16 instruction encodings are longer and some i16 instructions are slow,
52135 // so those are not desirable.
52136 if (VT == MVT::i16) {
52137 switch (Opc) {
52138 default:
52139 break;
52140 case ISD::LOAD:
52141 case ISD::SIGN_EXTEND:
52142 case ISD::ZERO_EXTEND:
52143 case ISD::ANY_EXTEND:
52144 case ISD::SHL:
52145 case ISD::SRA:
52146 case ISD::SRL:
52147 case ISD::SUB:
52148 case ISD::ADD:
52149 case ISD::MUL:
52150 case ISD::AND:
52151 case ISD::OR:
52152 case ISD::XOR:
52153 return false;
52154 }
52155 }
52156
52157 // Any legal type not explicitly accounted for above here is desirable.
52158 return true;
52159}
52160
52161SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
52162 SDValue Value, SDValue Addr,
52163 SelectionDAG &DAG) const {
52164 const Module *M = DAG.getMachineFunction().getMMI().getModule();
52165 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
52166 if (IsCFProtectionSupported) {
52167 // If control-flow branch protection is enabled, we need to add a notrack
52168 // prefix to the indirect branch. To do that we create an NT_BRIND SDNode.
52169 // During instruction selection, the corresponding pattern converts it to a
52170 // jmp with the NoTrack prefix.
52171 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
52172 }
52173
52174 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
52175}
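
// Illustrative aside, not part of the LLVM source above: a switch dense enough
// that it is typically lowered through a jump table. When it is compiled with
// -fcf-protection=branch, the "cf-protection-branch" module flag checked above
// is set and the indirect jump through the table is emitted with the notrack
// prefix (via X86ISD::NT_BRIND). Whether a jump table is used at all depends
// on the optimizer's heuristics.
int dispatch(int I) {
  switch (I) {
  case 0: return 100;
  case 1: return 231;
  case 2: return 342;
  case 3: return 453;
  case 4: return 564;
  case 5: return 675;
  default: return -1;
  }
}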
52176
52177bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
52178 EVT VT = Op.getValueType();
52179 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
52180 isa<ConstantSDNode>(Op.getOperand(1));
52181
52182 // i16 is legal, but undesirable since i16 instruction encodings are longer
52183 // and some i16 instructions are slow.
52184 // 8-bit multiply-by-constant can usually be expanded to something cheaper
52185 // using LEA and/or other ALU ops.
52186 if (VT != MVT::i16 && !Is8BitMulByConstant)
52187 return false;
52188
52189 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
52190 if (!Op.hasOneUse())
52191 return false;
52192 SDNode *User = *Op->use_begin();
52193 if (!ISD::isNormalStore(User))
52194 return false;
52195 auto *Ld = cast<LoadSDNode>(Load);
52196 auto *St = cast<StoreSDNode>(User);
52197 return Ld->getBasePtr() == St->getBasePtr();
52198 };
52199
52200 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
52201 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
52202 return false;
52203 if (!Op.hasOneUse())
52204 return false;
52205 SDNode *User = *Op->use_begin();
52206 if (User->getOpcode() != ISD::ATOMIC_STORE)
52207 return false;
52208 auto *Ld = cast<AtomicSDNode>(Load);
52209 auto *St = cast<AtomicSDNode>(User);
52210 return Ld->getBasePtr() == St->getBasePtr();
52211 };
52212
52213 bool Commute = false;
52214 switch (Op.getOpcode()) {
52215 default: return false;
52216 case ISD::SIGN_EXTEND:
52217 case ISD::ZERO_EXTEND:
52218 case ISD::ANY_EXTEND:
52219 break;
52220 case ISD::SHL:
52221 case ISD::SRA:
52222 case ISD::SRL: {
52223 SDValue N0 = Op.getOperand(0);
52224 // Look out for (store (shl (load), x)).
52225 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
52226 return false;
52227 break;
52228 }
52229 case ISD::ADD:
52230 case ISD::MUL:
52231 case ISD::AND:
52232 case ISD::OR:
52233 case ISD::XOR:
52234 Commute = true;
52235 LLVM_FALLTHROUGH;
52236 case ISD::SUB: {
52237 SDValue N0 = Op.getOperand(0);
52238 SDValue N1 = Op.getOperand(1);
52239 // Avoid disabling potential load folding opportunities.
52240 if (MayFoldLoad(N1) &&
52241 (!Commute || !isa<ConstantSDNode>(N0) ||
52242 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
52243 return false;
52244 if (MayFoldLoad(N0) &&
52245 ((Commute && !isa<ConstantSDNode>(N1)) ||
52246 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
52247 return false;
52248 if (IsFoldableAtomicRMW(N0, Op) ||
52249 (Commute && IsFoldableAtomicRMW(N1, Op)))
52250 return false;
52251 }
52252 }
52253
52254 PVT = MVT::i32;
52255 return true;
52256}
52257
52258//===----------------------------------------------------------------------===//
52259// X86 Inline Assembly Support
52260//===----------------------------------------------------------------------===//
52261
52262// Helper to match a string against a sequence of whitespace-separated pieces.
52263static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
52264 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
52265
52266 for (StringRef Piece : Pieces) {
52267 if (!S.startswith(Piece)) // Check if the piece matches.
52268 return false;
52269
52270 S = S.substr(Piece.size());
52271 StringRef::size_type Pos = S.find_first_not_of(" \t");
52272 if (Pos == 0) // We matched a prefix.
52273 return false;
52274
52275 S = S.substr(Pos);
52276 }
52277
52278 return S.empty();
52279}
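
// Illustrative aside, not part of the LLVM source above: how matchAsm behaves
// on the strings ExpandInlineAsm feeds it below. Each piece must match in
// order, separated by at least one space or tab, with nothing left over.
//
//   matchAsm("bswap $0",       {"bswap", "$0"})   -> true
//   matchAsm("  bswapl   $0",  {"bswapl", "$0"})  -> true  (extra whitespace ok)
//   matchAsm("bswapq$0",       {"bswapq", "$0"})  -> false (pieces must be
//                                                    whitespace-separated)
//   matchAsm("bswap $0 %eax",  {"bswap", "$0"})   -> false (leftover text)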
52280
52281static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
52282
52283 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
52284 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
52285 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
52286 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
52287
52288 if (AsmPieces.size() == 3)
52289 return true;
52290 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
52291 return true;
52292 }
52293 }
52294 return false;
52295}
52296
52297bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
52298 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
52299
52300 const std::string &AsmStr = IA->getAsmString();
52301
52302 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
52303 if (!Ty || Ty->getBitWidth() % 16 != 0)
52304 return false;
52305
52306 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
52307 SmallVector<StringRef, 4> AsmPieces;
52308 SplitString(AsmStr, AsmPieces, ";\n");
52309
52310 switch (AsmPieces.size()) {
52311 default: return false;
52312 case 1:
52313 // FIXME: this should verify that we are targeting a 486 or better. If not,
52314 // we will turn this bswap into something that will be lowered to logical
52315 // ops instead of emitting the bswap asm. For now, we don't support 486 or
52316 // lower so don't worry about this.
52317 // bswap $0
52318 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
52319 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
52320 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
52321 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
52322 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
52323 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
52324 // No need to check constraints, nothing other than the equivalent of
52325 // "=r,0" would be valid here.
52326 return IntrinsicLowering::LowerToByteSwap(CI);
52327 }
52328
52329 // rorw $$8, ${0:w} --> llvm.bswap.i16
52330 if (CI->getType()->isIntegerTy(16) &&
52331 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
52332 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
52333 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
52334 AsmPieces.clear();
52335 StringRef ConstraintsStr = IA->getConstraintString();
52336 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
52337 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
52338 if (clobbersFlagRegisters(AsmPieces))
52339 return IntrinsicLowering::LowerToByteSwap(CI);
52340 }
52341 break;
52342 case 3:
52343 if (CI->getType()->isIntegerTy(32) &&
52344 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
52345 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
52346 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
52347 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
52348 AsmPieces.clear();
52349 StringRef ConstraintsStr = IA->getConstraintString();
52350 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
52351 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
52352 if (clobbersFlagRegisters(AsmPieces))
52353 return IntrinsicLowering::LowerToByteSwap(CI);
52354 }
52355
52356 if (CI->getType()->isIntegerTy(64)) {
52357 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
52358 if (Constraints.size() >= 2 &&
52359 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
52360 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
52361 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
52362 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
52363 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
52364 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
52365 return IntrinsicLowering::LowerToByteSwap(CI);
52366 }
52367 }
52368 break;
52369 }
52370 return false;
52371}
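
// Illustrative aside, not part of the LLVM source above: inline asm that the
// "bswap $0" case in ExpandInlineAsm recognizes and replaces with llvm.bswap,
// so no asm block survives into the final code. Note that %0 in the C source
// is rendered as $0 in the LLVM asm string, and the "=r"/"0" constraints give
// exactly the "=r,0" shape the comment above refers to.
#include <cstdint>

uint32_t byteswap32(uint32_t X) {
  __asm__("bswap %0" : "=r"(X) : "0"(X));
  return X;
}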
52372
52373static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
52374 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
52375 .Case("{@cca}", X86::COND_A)
52376 .Case("{@ccae}", X86::COND_AE)
52377 .Case("{@ccb}", X86::COND_B)
52378 .Case("{@ccbe}", X86::COND_BE)
52379 .Case("{@ccc}", X86::COND_B)
52380 .Case("{@cce}", X86::COND_E)
52381 .Case("{@ccz}", X86::COND_E)
52382 .Case("{@ccg}", X86::COND_G)
52383 .Case("{@ccge}", X86::COND_GE)
52384 .Case("{@ccl}", X86::COND_L)
52385 .Case("{@ccle}", X86::COND_LE)
52386 .Case("{@ccna}", X86::COND_BE)
52387 .Case("{@ccnae}", X86::COND_B)
52388 .Case("{@ccnb}", X86::COND_AE)
52389 .Case("{@ccnbe}", X86::COND_A)
52390 .Case("{@ccnc}", X86::COND_AE)
52391 .Case("{@ccne}", X86::COND_NE)
52392 .Case("{@ccnz}", X86::COND_NE)
52393 .Case("{@ccng}", X86::COND_LE)
52394 .Case("{@ccnge}", X86::COND_L)
52395 .Case("{@ccnl}", X86::COND_GE)
52396 .Case("{@ccnle}", X86::COND_G)
52397 .Case("{@ccno}", X86::COND_NO)
52398 .Case("{@ccnp}", X86::COND_NP)
52399 .Case("{@ccns}", X86::COND_NS)
52400 .Case("{@cco}", X86::COND_O)
52401 .Case("{@ccp}", X86::COND_P)
52402 .Case("{@ccs}", X86::COND_S)
52403 .Default(X86::COND_INVALID);
52404 return Cond;
52405}
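
// Illustrative aside, not part of the LLVM source above: the @cc flag-output
// constraints parsed above, as they look in C source. "=@ccz" asks for the ZF
// flag after the asm, which LowerAsmOutputForConstraint below turns into a
// SETcc on EFLAGS rather than a register move.
int sets_zero_flag(int X) {
  int ZeroFlag;
  __asm__("testl %1, %1" : "=@ccz"(ZeroFlag) : "r"(X));
  return ZeroFlag; // 1 if X == 0, else 0
}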
52406
52407/// Given a constraint letter, return the type of constraint for this target.
52408X86TargetLowering::ConstraintType
52409X86TargetLowering::getConstraintType(StringRef Constraint) const {
52410 if (Constraint.size() == 1) {
52411 switch (Constraint[0]) {
52412 case 'R':
52413 case 'q':
52414 case 'Q':
52415 case 'f':
52416 case 't':
52417 case 'u':
52418 case 'y':
52419 case 'x':
52420 case 'v':
52421 case 'l':
52422 case 'k': // AVX512 masking registers.
52423 return C_RegisterClass;
52424 case 'a':
52425 case 'b':
52426 case 'c':
52427 case 'd':
52428 case 'S':
52429 case 'D':
52430 case 'A':
52431 return C_Register;
52432 case 'I':
52433 case 'J':
52434 case 'K':
52435 case 'N':
52436 case 'G':
52437 case 'L':
52438 case 'M':
52439 return C_Immediate;
52440 case 'C':
52441 case 'e':
52442 case 'Z':
52443 return C_Other;
52444 default:
52445 break;
52446 }
52447 }
52448 else if (Constraint.size() == 2) {
52449 switch (Constraint[0]) {
52450 default:
52451 break;
52452 case 'Y':
52453 switch (Constraint[1]) {
52454 default:
52455 break;
52456 case 'z':
52457 return C_Register;
52458 case 'i':
52459 case 'm':
52460 case 'k':
52461 case 't':
52462 case '2':
52463 return C_RegisterClass;
52464 }
52465 }
52466 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
52467 return C_Other;
52468 return TargetLowering::getConstraintType(Constraint);
52469}
52470
52471/// Examine constraint type and operand type and determine a weight value.
52472/// This object must already have been set up with the operand type
52473/// and the current alternative constraint selected.
52474TargetLowering::ConstraintWeight
52475 X86TargetLowering::getSingleConstraintMatchWeight(
52476 AsmOperandInfo &info, const char *constraint) const {
52477 ConstraintWeight weight = CW_Invalid;
52478 Value *CallOperandVal = info.CallOperandVal;
52479 // If we don't have a value, we can't do a match,
52480 // but allow it at the lowest weight.
52481 if (!CallOperandVal)
52482 return CW_Default;
52483 Type *type = CallOperandVal->getType();
52484 // Look at the constraint type.
52485 switch (*constraint) {
52486 default:
52487 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
52488 LLVM_FALLTHROUGH;
52489 case 'R':
52490 case 'q':
52491 case 'Q':
52492 case 'a':
52493 case 'b':
52494 case 'c':
52495 case 'd':
52496 case 'S':
52497 case 'D':
52498 case 'A':
52499 if (CallOperandVal->getType()->isIntegerTy())
52500 weight = CW_SpecificReg;
52501 break;
52502 case 'f':
52503 case 't':
52504 case 'u':
52505 if (type->isFloatingPointTy())
52506 weight = CW_SpecificReg;
52507 break;
52508 case 'y':
52509 if (type->isX86_MMXTy() && Subtarget.hasMMX())
52510 weight = CW_SpecificReg;
52511 break;
52512 case 'Y':
52513 if (StringRef(constraint).size() != 2)
52514 break;
52515 switch (constraint[1]) {
52516 default:
52517 return CW_Invalid;
52518 // XMM0
52519 case 'z':
52520 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
52521 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
52522 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
52523 return CW_SpecificReg;
52524 return CW_Invalid;
52525 // Conditional OpMask regs (AVX512)
52526 case 'k':
52527 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
52528 return CW_Register;
52529 return CW_Invalid;
52530 // Any MMX reg
52531 case 'm':
52532 if (type->isX86_MMXTy() && Subtarget.hasMMX())
52533 return weight;
52534 return CW_Invalid;
52535 // Any SSE reg when ISA >= SSE2, same as 'x'
52536 case 'i':
52537 case 't':
52538 case '2':
52539 if (!Subtarget.hasSSE2())
52540 return CW_Invalid;
52541 break;
52542 }
52543 break;
52544 case 'v':
52545 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
52546 weight = CW_Register;
52547 LLVM_FALLTHROUGH;
52548 case 'x':
52549 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
52550 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
52551 weight = CW_Register;
52552 break;
52553 case 'k':
52554 // Enable conditional vector operations using %k<#> registers.
52555 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
52556 weight = CW_Register;
52557 break;
52558 case 'I':
52559 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
52560 if (C->getZExtValue() <= 31)
52561 weight = CW_Constant;
52562 }
52563 break;
52564 case 'J':
52565 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52566 if (C->getZExtValue() <= 63)
52567 weight = CW_Constant;
52568 }
52569 break;
52570 case 'K':
52571 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52572 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
52573 weight = CW_Constant;
52574 }
52575 break;
52576 case 'L':
52577 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52578 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
52579 weight = CW_Constant;
52580 }
52581 break;
52582 case 'M':
52583 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52584 if (C->getZExtValue() <= 3)
52585 weight = CW_Constant;
52586 }
52587 break;
52588 case 'N':
52589 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52590 if (C->getZExtValue() <= 0xff)
52591 weight = CW_Constant;
52592 }
52593 break;
52594 case 'G':
52595 case 'C':
52596 if (isa<ConstantFP>(CallOperandVal)) {
52597 weight = CW_Constant;
52598 }
52599 break;
52600 case 'e':
52601 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52602 if ((C->getSExtValue() >= -0x80000000LL) &&
52603 (C->getSExtValue() <= 0x7fffffffLL))
52604 weight = CW_Constant;
52605 }
52606 break;
52607 case 'Z':
52608 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
52609 if (C->getZExtValue() <= 0xffffffff)
52610 weight = CW_Constant;
52611 }
52612 break;
52613 }
52614 return weight;
52615}
52616
52617/// Try to replace an X constraint, which matches anything, with another that
52618/// has more specific requirements based on the type of the corresponding
52619/// operand.
52620const char *X86TargetLowering::
52621LowerXConstraint(EVT ConstraintVT) const {
52622 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
52623 // 'f' like normal targets.
52624 if (ConstraintVT.isFloatingPoint()) {
52625 if (Subtarget.hasSSE1())
52626 return "x";
52627 }
52628
52629 return TargetLowering::LowerXConstraint(ConstraintVT);
52630}
52631
52632// Lower @cc targets via setcc.
52633SDValue X86TargetLowering::LowerAsmOutputForConstraint(
52634 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
52635 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
52636 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
52637 if (Cond == X86::COND_INVALID)
52638 return SDValue();
52639 // Check that return type is valid.
52640 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
52641 OpInfo.ConstraintVT.getSizeInBits() < 8)
52642 report_fatal_error("Flag output operand is of invalid type");
52643
52644 // Get EFLAGS register. Only update chain when copyfrom is glued.
52645 if (Flag.getNode()) {
52646 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
52647 Chain = Flag.getValue(1);
52648 } else
52649 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
52650 // Extract CC code.
52651 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
52652 // Extend to 32-bits
52653 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
52654
52655 return Result;
52656}
52657
52658/// Lower the specified operand into the Ops vector.
52659/// If it is invalid, don't add anything to Ops.
52660void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
52661 std::string &Constraint,
52662 std::vector<SDValue>&Ops,
52663 SelectionDAG &DAG) const {
52664 SDValue Result;
52665
52666 // Only support length 1 constraints for now.
52667 if (Constraint.length() > 1) return;
52668
52669 char ConstraintLetter = Constraint[0];
52670 switch (ConstraintLetter) {
52671 default: break;
52672 case 'I':
52673 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52674 if (C->getZExtValue() <= 31) {
52675 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52676 Op.getValueType());
52677 break;
52678 }
52679 }
52680 return;
52681 case 'J':
52682 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52683 if (C->getZExtValue() <= 63) {
52684 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52685 Op.getValueType());
52686 break;
52687 }
52688 }
52689 return;
52690 case 'K':
52691 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52692 if (isInt<8>(C->getSExtValue())) {
52693 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52694 Op.getValueType());
52695 break;
52696 }
52697 }
52698 return;
52699 case 'L':
52700 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52701 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
52702 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
52703 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
52704 Op.getValueType());
52705 break;
52706 }
52707 }
52708 return;
52709 case 'M':
52710 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52711 if (C->getZExtValue() <= 3) {
52712 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52713 Op.getValueType());
52714 break;
52715 }
52716 }
52717 return;
52718 case 'N':
52719 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52720 if (C->getZExtValue() <= 255) {
52721 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52722 Op.getValueType());
52723 break;
52724 }
52725 }
52726 return;
52727 case 'O':
52728 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52729 if (C->getZExtValue() <= 127) {
52730 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52731 Op.getValueType());
52732 break;
52733 }
52734 }
52735 return;
52736 case 'e': {
52737 // 32-bit signed value
52738 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52739 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
52740 C->getSExtValue())) {
52741 // Widen to 64 bits here to get it sign extended.
52742 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
52743 break;
52744 }
52745 // FIXME gcc accepts some relocatable values here too, but only in certain
52746 // memory models; it's complicated.
52747 }
52748 return;
52749 }
52750 case 'Z': {
52751 // 32-bit unsigned value
52752 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
52753 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
52754 C->getZExtValue())) {
52755 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
52756 Op.getValueType());
52757 break;
52758 }
52759 }
52760 // FIXME gcc accepts some relocatable values here too, but only in certain
52761 // memory models; it's complicated.
52762 return;
52763 }
52764 case 'i': {
52765 // Literal immediates are always ok.
52766 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
52767 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
52768 BooleanContent BCont = getBooleanContents(MVT::i64);
52769 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
52770 : ISD::SIGN_EXTEND;
52771 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
52772 : CST->getSExtValue();
52773 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
52774 break;
52775 }
52776
52777 // In any sort of PIC mode addresses need to be computed at runtime by
52778 // adding in a register or some sort of table lookup. These can't
52779 // be used as immediates.
52780 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
52781 return;
52782
52783 // If we are in non-pic codegen mode, we allow the address of a global (with
52784 // an optional displacement) to be used with 'i'.
52785 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
52786 // If we require an extra load to get this address, as in PIC mode, we
52787 // can't accept it.
52788 if (isGlobalStubReference(
52789 Subtarget.classifyGlobalReference(GA->getGlobal())))
52790 return;
52791 break;
52792 }
52793 }
52794
52795 if (Result.getNode()) {
52796 Ops.push_back(Result);
52797 return;
52798 }
52799 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
52800}
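
// Illustrative aside, not part of the LLVM source above: the 'I' immediate
// constraint validated above, as written in C. 'I' accepts constants 0..31
// (e.g. shift counts); out-of-range constants are rejected, which is exactly
// what the range check above implements.
unsigned shift_by_three(unsigned X) {
  __asm__("shll %1, %0" : "+r"(X) : "I"(3));
  return X; // X << 3
}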
52801
52802/// Check if \p RC is a general purpose register class.
52803/// I.e., GR* or one of their variant.
52804static bool isGRClass(const TargetRegisterClass &RC) {
52805 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
52806 RC.hasSuperClassEq(&X86::GR16RegClass) ||
52807 RC.hasSuperClassEq(&X86::GR32RegClass) ||
52808 RC.hasSuperClassEq(&X86::GR64RegClass) ||
52809 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
52810}
52811
52812/// Check if \p RC is a vector register class.
52813/// I.e., FR* / VR* or one of their variant.
52814static bool isFRClass(const TargetRegisterClass &RC) {
52815 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
52816 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
52817 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
52818 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
52819 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
52820 RC.hasSuperClassEq(&X86::VR512RegClass);
52821}
52822
52823/// Check if \p RC is a mask register class.
52824/// I.e., VK* or one of their variant.
52825static bool isVKClass(const TargetRegisterClass &RC) {
52826 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
52827 RC.hasSuperClassEq(&X86::VK2RegClass) ||
52828 RC.hasSuperClassEq(&X86::VK4RegClass) ||
52829 RC.hasSuperClassEq(&X86::VK8RegClass) ||
52830 RC.hasSuperClassEq(&X86::VK16RegClass) ||
52831 RC.hasSuperClassEq(&X86::VK32RegClass) ||
52832 RC.hasSuperClassEq(&X86::VK64RegClass);
52833}
52834
52835std::pair<unsigned, const TargetRegisterClass *>
52836X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
52837 StringRef Constraint,
52838 MVT VT) const {
52839 // First, see if this is a constraint that directly corresponds to an LLVM
52840 // register class.
52841 if (Constraint.size() == 1) {
52842 // GCC Constraint Letters
52843 switch (Constraint[0]) {
52844 default: break;
52845 // 'A' means [ER]AX + [ER]DX.
52846 case 'A':
52847 if (Subtarget.is64Bit())
52848 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
52849 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
52850 "Expecting 64, 32 or 16 bit subtarget");
52851 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
52852
52853 // TODO: Slight differences here in allocation order and leaving
52854 // RIP in the class. Do they matter any more here than they do
52855 // in the normal allocation?
52856 case 'k':
52857 if (Subtarget.hasAVX512()) {
52858 if (VT == MVT::i1)
52859 return std::make_pair(0U, &X86::VK1RegClass);
52860 if (VT == MVT::i8)
52861 return std::make_pair(0U, &X86::VK8RegClass);
52862 if (VT == MVT::i16)
52863 return std::make_pair(0U, &X86::VK16RegClass);
52864 }
52865 if (Subtarget.hasBWI()) {
52866 if (VT == MVT::i32)
52867 return std::make_pair(0U, &X86::VK32RegClass);
52868 if (VT == MVT::i64)
52869 return std::make_pair(0U, &X86::VK64RegClass);
52870 }
52871 break;
52872 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
52873 if (Subtarget.is64Bit()) {
52874 if (VT == MVT::i8 || VT == MVT::i1)
52875 return std::make_pair(0U, &X86::GR8RegClass);
52876 if (VT == MVT::i16)
52877 return std::make_pair(0U, &X86::GR16RegClass);
52878 if (VT == MVT::i32 || VT == MVT::f32)
52879 return std::make_pair(0U, &X86::GR32RegClass);
52880 if (VT != MVT::f80 && !VT.isVector())
52881 return std::make_pair(0U, &X86::GR64RegClass);
52882 break;
52883 }
52884 LLVM_FALLTHROUGH;
52885 // 32-bit fallthrough
52886 case 'Q': // Q_REGS
52887 if (VT == MVT::i8 || VT == MVT::i1)
52888 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
52889 if (VT == MVT::i16)
52890 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
52891 if (VT == MVT::i32 || VT == MVT::f32 ||
52892 (!VT.isVector() && !Subtarget.is64Bit()))
52893 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
52894 if (VT != MVT::f80 && !VT.isVector())
52895 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
52896 break;
52897 case 'r': // GENERAL_REGS
52898 case 'l': // INDEX_REGS
52899 if (VT == MVT::i8 || VT == MVT::i1)
52900 return std::make_pair(0U, &X86::GR8RegClass);
52901 if (VT == MVT::i16)
52902 return std::make_pair(0U, &X86::GR16RegClass);
52903 if (VT == MVT::i32 || VT == MVT::f32 ||
52904 (!VT.isVector() && !Subtarget.is64Bit()))
52905 return std::make_pair(0U, &X86::GR32RegClass);
52906 if (VT != MVT::f80 && !VT.isVector())
52907 return std::make_pair(0U, &X86::GR64RegClass);
52908 break;
52909 case 'R': // LEGACY_REGS
52910 if (VT == MVT::i8 || VT == MVT::i1)
52911 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
52912 if (VT == MVT::i16)
52913 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
52914 if (VT == MVT::i32 || VT == MVT::f32 ||
52915 (!VT.isVector() && !Subtarget.is64Bit()))
52916 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
52917 if (VT != MVT::f80 && !VT.isVector())
52918 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
52919 break;
52920 case 'f': // FP Stack registers.
52921 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
52922 // value to the correct fpstack register class.
52923 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
52924 return std::make_pair(0U, &X86::RFP32RegClass);
52925 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
52926 return std::make_pair(0U, &X86::RFP64RegClass);
52927 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
52928 return std::make_pair(0U, &X86::RFP80RegClass);
52929 break;
52930 case 'y': // MMX_REGS if MMX allowed.
52931 if (!Subtarget.hasMMX()) break;
52932 return std::make_pair(0U, &X86::VR64RegClass);
52933 case 'v':
52934 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
52935 if (!Subtarget.hasSSE1()) break;
52936 bool VConstraint = (Constraint[0] == 'v');
52937
52938 switch (VT.SimpleTy) {
52939 default: break;
52940 // Scalar SSE types.
52941 case MVT::f32:
52942 case MVT::i32:
52943 if (VConstraint && Subtarget.hasVLX())
52944 return std::make_pair(0U, &X86::FR32XRegClass);
52945 return std::make_pair(0U, &X86::FR32RegClass);
52946 case MVT::f64:
52947 case MVT::i64:
52948 if (VConstraint && Subtarget.hasVLX())
52949 return std::make_pair(0U, &X86::FR64XRegClass);
52950 return std::make_pair(0U, &X86::FR64RegClass);
52951 case MVT::i128:
52952 if (Subtarget.is64Bit()) {
52953 if (VConstraint && Subtarget.hasVLX())
52954 return std::make_pair(0U, &X86::VR128XRegClass);
52955 return std::make_pair(0U, &X86::VR128RegClass);
52956 }
52957 break;
52958 // Vector types and fp128.
52959 case MVT::f128:
52960 case MVT::v16i8:
52961 case MVT::v8i16:
52962 case MVT::v4i32:
52963 case MVT::v2i64:
52964 case MVT::v4f32:
52965 case MVT::v2f64:
52966 if (VConstraint && Subtarget.hasVLX())
52967 return std::make_pair(0U, &X86::VR128XRegClass);
52968 return std::make_pair(0U, &X86::VR128RegClass);
52969 // AVX types.
52970 case MVT::v32i8:
52971 case MVT::v16i16:
52972 case MVT::v8i32:
52973 case MVT::v4i64:
52974 case MVT::v8f32:
52975 case MVT::v4f64:
52976 if (VConstraint && Subtarget.hasVLX())
52977 return std::make_pair(0U, &X86::VR256XRegClass);
52978 if (Subtarget.hasAVX())
52979 return std::make_pair(0U, &X86::VR256RegClass);
52980 break;
52981 case MVT::v64i8:
52982 case MVT::v32i16:
52983 case MVT::v8f64:
52984 case MVT::v16f32:
52985 case MVT::v16i32:
52986 case MVT::v8i64:
52987 if (!Subtarget.hasAVX512()) break;
52988 if (VConstraint)
52989 return std::make_pair(0U, &X86::VR512RegClass);
52990 return std::make_pair(0U, &X86::VR512_0_15RegClass);
52991 }
52992 break;
52993 }
52994 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
52995 switch (Constraint[1]) {
52996 default:
52997 break;
52998 case 'i':
52999 case 't':
53000 case '2':
53001 return getRegForInlineAsmConstraint(TRI, "x", VT);
53002 case 'm':
53003 if (!Subtarget.hasMMX()) break;
53004 return std::make_pair(0U, &X86::VR64RegClass);
53005 case 'z':
53006 if (!Subtarget.hasSSE1()) break;
53007 switch (VT.SimpleTy) {
53008 default: break;
53009 // Scalar SSE types.
53010 case MVT::f32:
53011 case MVT::i32:
53012 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
53013 case MVT::f64:
53014 case MVT::i64:
53015 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
53016 case MVT::f128:
53017 case MVT::v16i8:
53018 case MVT::v8i16:
53019 case MVT::v4i32:
53020 case MVT::v2i64:
53021 case MVT::v4f32:
53022 case MVT::v2f64:
53023 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
53024 // AVX types.
53025 case MVT::v32i8:
53026 case MVT::v16i16:
53027 case MVT::v8i32:
53028 case MVT::v4i64:
53029 case MVT::v8f32:
53030 case MVT::v4f64:
53031 if (Subtarget.hasAVX())
53032 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
53033 break;
53034 case MVT::v64i8:
53035 case MVT::v32i16:
53036 case MVT::v8f64:
53037 case MVT::v16f32:
53038 case MVT::v16i32:
53039 case MVT::v8i64:
53040 if (Subtarget.hasAVX512())
53041 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
53042 break;
53043 }
53044 break;
53045 case 'k':
53046 // This register class doesn't allocate k0 for masked vector operations.
53047 if (Subtarget.hasAVX512()) {
53048 if (VT == MVT::i1)
53049 return std::make_pair(0U, &X86::VK1WMRegClass);
53050 if (VT == MVT::i8)
53051 return std::make_pair(0U, &X86::VK8WMRegClass);
53052 if (VT == MVT::i16)
53053 return std::make_pair(0U, &X86::VK16WMRegClass);
53054 }
53055 if (Subtarget.hasBWI()) {
53056 if (VT == MVT::i32)
53057 return std::make_pair(0U, &X86::VK32WMRegClass);
53058 if (VT == MVT::i64)
53059 return std::make_pair(0U, &X86::VK64WMRegClass);
53060 }
53061 break;
53062 }
53063 }
53064
53065 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
53066 return std::make_pair(0U, &X86::GR32RegClass);
53067
53068 // Use the default implementation in TargetLowering to convert the register
53069 // constraint into a member of a register class.
53070 std::pair<Register, const TargetRegisterClass*> Res;
53071 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
53072
53073 // Not found as a standard register?
53074 if (!Res.second) {
53075 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
53076 // to/from f80.
53077 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
53078 // Map st(0) -> st(7) -> ST0
53079 if (Constraint.size() == 7 && Constraint[0] == '{' &&
53080 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
53081 Constraint[3] == '(' &&
53082 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
53083 Constraint[5] == ')' && Constraint[6] == '}') {
53084 // st(7) is not allocatable and thus not a member of RFP80. Return
53085 // singleton class in cases where we have a reference to it.
53086 if (Constraint[4] == '7')
53087 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
53088 return std::make_pair(X86::FP0 + Constraint[4] - '0',
53089 &X86::RFP80RegClass);
53090 }
53091
53092 // GCC allows "st(0)" to be called just plain "st".
53093 if (StringRef("{st}").equals_insensitive(Constraint))
53094 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
53095 }
53096
53097 // flags -> EFLAGS
53098 if (StringRef("{flags}").equals_insensitive(Constraint))
53099 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
53100
53101 // dirflag -> DF
53102 // Only allow for clobber.
53103 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
53104 VT == MVT::Other)
53105 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
53106
53107 // fpsr -> FPSW
53108 if (StringRef("{fpsr}").equals_insensitive(Constraint))
53109 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
53110
53111 return Res;
53112 }
53113
53114 // Make sure it isn't a register that requires 64-bit mode.
53115 if (!Subtarget.is64Bit() &&
53116 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
53117 TRI->getEncodingValue(Res.first) >= 8) {
53118 // Register requires REX prefix, but we're in 32-bit mode.
53119 return std::make_pair(0, nullptr);
53120 }
53121
53122 // Make sure it isn't a register that requires AVX512.
53123 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
53124 TRI->getEncodingValue(Res.first) & 0x10) {
53125 // Register requires EVEX prefix.
53126 return std::make_pair(0, nullptr);
53127 }
53128
53129 // Otherwise, check to see if this is a register class of the wrong value
53130 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
53131 // turn into {ax},{dx}.
53132 // MVT::Other is used to specify clobber names.
53133 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
53134 return Res; // Correct type already, nothing to do.
53135
53136 // Get a matching integer register of the correct size, e.g. "ax" with MVT::i32
53137 // should return "eax". This should even work for things like getting 64-bit
53138 // integer registers when given an f64 type.
53139 const TargetRegisterClass *Class = Res.second;
53140 // The generic code will match the first register class that contains the
53141 // given register. Thus, based on the ordering of the tablegened file,
53142 // the "plain" GR classes might not come first.
53143 // Therefore, use a helper method.
53144 if (isGRClass(*Class)) {
53145 unsigned Size = VT.getSizeInBits();
53146 if (Size == 1) Size = 8;
53147 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
53148 if (DestReg > 0) {
53149 bool is64Bit = Subtarget.is64Bit();
53150 const TargetRegisterClass *RC =
53151 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
53152 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
53153 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
53154 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
53155 : nullptr;
53156 if (Size == 64 && !is64Bit) {
53157 // Model GCC's behavior here and select a fixed pair of 32-bit
53158 // registers.
53159 switch (DestReg) {
53160 case X86::RAX:
53161 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
53162 case X86::RDX:
53163 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
53164 case X86::RCX:
53165 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
53166 case X86::RBX:
53167 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
53168 case X86::RSI:
53169 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
53170 case X86::RDI:
53171 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
53172 case X86::RBP:
53173 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
53174 default:
53175 return std::make_pair(0, nullptr);
53176 }
53177 }
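      // Illustrative note (assumed example, not part of the original source):
      // in 32-bit mode an i64 operand constrained to "{eax}" resolves to RAX
      // above and is returned here as EAX in the GR32_AD pair class, i.e. the
      // fixed 32-bit register pair GCC would pick for the same constraint.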
53178 if (RC && RC->contains(DestReg))
53179 return std::make_pair(DestReg, RC);
53180 return Res;
53181 }
53182 // No register found/type mismatch.
53183 return std::make_pair(0, nullptr);
53184 } else if (isFRClass(*Class)) {
53185 // Handle references to XMM physical registers that got mapped into the
53186 // wrong class. This can happen with constraints like {xmm0} where the
53187 // target independent register mapper will just pick the first match it can
53188 // find, ignoring the required type.
53189
53190 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
53191 if (VT == MVT::f32 || VT == MVT::i32)
53192 Res.second = &X86::FR32XRegClass;
53193 else if (VT == MVT::f64 || VT == MVT::i64)
53194 Res.second = &X86::FR64XRegClass;
53195 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
53196 Res.second = &X86::VR128XRegClass;
53197 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
53198 Res.second = &X86::VR256XRegClass;
53199 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
53200 Res.second = &X86::VR512RegClass;
53201 else {
53202      // Type mismatch and not a clobber: return an error.
53203 Res.first = 0;
53204 Res.second = nullptr;
53205 }
53206 } else if (isVKClass(*Class)) {
53207 if (VT == MVT::i1)
53208 Res.second = &X86::VK1RegClass;
53209 else if (VT == MVT::i8)
53210 Res.second = &X86::VK8RegClass;
53211 else if (VT == MVT::i16)
53212 Res.second = &X86::VK16RegClass;
53213 else if (VT == MVT::i32)
53214 Res.second = &X86::VK32RegClass;
53215 else if (VT == MVT::i64)
53216 Res.second = &X86::VK64RegClass;
53217 else {
53218      // Type mismatch and not a clobber: return an error.
53219 Res.first = 0;
53220 Res.second = nullptr;
53221 }
53222 }
53223
53224 return Res;
53225}
53226
53227InstructionCost X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
53228 const AddrMode &AM,
53229 Type *Ty,
53230 unsigned AS) const {
53231 // Scaling factors are not free at all.
53232 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
53233  // will take 2 allocations in the out-of-order engine instead of 1
53234 // for plain addressing mode, i.e. inst (reg1).
53235 // E.g.,
53236 // vaddps (%rsi,%rdx), %ymm0, %ymm1
53237 // Requires two allocations (one for the load, one for the computation)
53238 // whereas:
53239 // vaddps (%rsi), %ymm0, %ymm1
53240 // Requires just 1 allocation, i.e., freeing allocations for other operations
53241  // and having fewer micro-operations to execute.
53242 //
53243 // For some X86 architectures, this is even worse because for instance for
53244 // stores, the complex addressing mode forces the instruction to use the
53245 // "load" ports instead of the dedicated "store" port.
53246 // E.g., on Haswell:
53247 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
53248 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
53249 if (isLegalAddressingMode(DL, AM, Ty, AS))
53250 // Scale represents reg2 * scale, thus account for 1
53251 // as soon as we use a second register.
53252 return AM.Scale != 0;
53253 return -1;
53254}
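// Illustrative sketch (hypothetical helper, not part of the original source):
// builds the two addressing modes described in the comment above. For a legal
// mode the hook returns AM.Scale != 0, so the base-only form costs 0, the
// base+index form costs 1, and an illegal mode reports -1.
static void scalingFactorCostSketch() {
  TargetLowering::AddrMode BaseOnly;  // models vaddps (%rsi), %ymm0, %ymm1
  BaseOnly.HasBaseReg = true;         // Scale == 0 -> cost 0

  TargetLowering::AddrMode BaseIndex; // models vaddps (%rsi,%rdx), %ymm0, %ymm1
  BaseIndex.HasBaseReg = true;
  BaseIndex.Scale = 1;                // Scale != 0 -> cost 1

  (void)BaseOnly;
  (void)BaseIndex;
}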
53255
53256bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
53257 // Integer division on x86 is expensive. However, when aggressively optimizing
53258 // for code size, we prefer to use a div instruction, as it is usually smaller
53259 // than the alternative sequence.
53260 // The exception to this is vector division. Since x86 doesn't have vector
53261 // integer division, leaving the division as-is is a loss even in terms of
53262 // size, because it will have to be scalarized, while the alternative code
53263 // sequence can be performed in vector form.
53264 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
53265 return OptSize && !VT.isVector();
53266}
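// Illustrative sketch (hypothetical user code, not part of the original
// source): under clang's minsize attribute the scalar division below keeps a
// single hardware div (this hook returns true for i32), while a vector
// division is still expanded because scalarizing it would cost more size.
//
//   __attribute__((minsize)) unsigned divideByTen(unsigned X) {
//     return X / 10;
//   }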
53267
53268void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
53269 if (!Subtarget.is64Bit())
53270 return;
53271
53272 // Update IsSplitCSR in X86MachineFunctionInfo.
53273 X86MachineFunctionInfo *AFI =
53274 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
53275 AFI->setIsSplitCSR(true);
53276}
53277
53278void X86TargetLowering::insertCopiesSplitCSR(
53279 MachineBasicBlock *Entry,
53280 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
53281 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
53282 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
53283 if (!IStart)
53284 return;
53285
53286 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
53287 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
53288 MachineBasicBlock::iterator MBBI = Entry->begin();
53289 for (const MCPhysReg *I = IStart; *I; ++I) {
53290 const TargetRegisterClass *RC = nullptr;
53291 if (X86::GR64RegClass.contains(*I))
53292 RC = &X86::GR64RegClass;
53293 else
53294      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
53295
53296 Register NewVR = MRI->createVirtualRegister(RC);
53297 // Create copy from CSR to a virtual register.
53298 // FIXME: this currently does not emit CFI pseudo-instructions, it works
53299 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
53300 // nounwind. If we want to generalize this later, we may need to emit
53301 // CFI pseudo-instructions.
53302    assert(
53303        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
53304        "Function should be nounwind in insertCopiesSplitCSR!");
53305 Entry->addLiveIn(*I);
53306 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
53307 .addReg(*I);
53308
53309 // Insert the copy-back instructions right before the terminator.
53310 for (auto *Exit : Exits)
53311 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
53312 TII->get(TargetOpcode::COPY), *I)
53313 .addReg(NewVR);
53314 }
53315}
53316
53317bool X86TargetLowering::supportSwiftError() const {
53318 return Subtarget.is64Bit();
53319}
53320
53321/// Returns true if stack probing through a function call is requested.
53322bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
53323 return !getStackProbeSymbolName(MF).empty();
53324}
53325
53326/// Returns true if stack probing through inline assembly is requested.
53327bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
53328
53329  // No inline stack probes on Windows; it has its own mechanism.
53330 if (Subtarget.isOSWindows() ||
53331 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
53332 return false;
53333
53334 // If the function specifically requests inline stack probes, emit them.
53335 if (MF.getFunction().hasFnAttribute("probe-stack"))
53336 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
53337 "inline-asm";
53338
53339 return false;
53340}
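// Illustrative sketch (hypothetical front-end or pass code, not part of the
// original source): functions opt into the probing behavior above through
// string attributes on the IR function.
static void requestInlineStackProbes(Function &F) {
  F.addFnAttr("probe-stack", "inline-asm"); // hasInlineStackProbe() -> true,
                                            // getStackProbeSymbolName() -> ""
}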
53341
53342/// Returns the name of the symbol used to emit stack probes or the empty
53343/// string if not applicable.
53344StringRef
53345X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
53346  // Inline stack probes disable the stack probe call.
53347 if (hasInlineStackProbe(MF))
53348 return "";
53349
53350 // If the function specifically requests stack probes, emit them.
53351 if (MF.getFunction().hasFnAttribute("probe-stack"))
53352 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
53353
53354 // Generally, if we aren't on Windows, the platform ABI does not include
53355 // support for stack probes, so don't emit them.
53356 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
53357 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
53358 return "";
53359
53360 // We need a stack probe to conform to the Windows ABI. Choose the right
53361 // symbol.
53362 if (Subtarget.is64Bit())
53363 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
53364 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
53365}
53366
53367unsigned
53368X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
53369  // The default stack probe size is 4096 if the function has no
53370  // "stack-probe-size" attribute.
53371 unsigned StackProbeSize = 4096;
53372 const Function &Fn = MF.getFunction();
53373 if (Fn.hasFnAttribute("stack-probe-size"))
53374 Fn.getFnAttribute("stack-probe-size")
53375 .getValueAsString()
53376 .getAsInteger(0, StackProbeSize);
53377 return StackProbeSize;
53378}
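// Illustrative note (hypothetical usage, not part of the original source): the
// probe size is carried as a string attribute as well, e.g.
// F.addFnAttr("stack-probe-size", "8192") makes getStackProbeSize() return
// 8192; if the string does not parse, getAsInteger fails without touching
// StackProbeSize and the 4096 default is kept.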
53379
53380Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
53381 if (ML->isInnermost() &&
53382 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
53383 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
53384 return TargetLowering::getPrefLoopAlignment();
53385}