Bug Summary

File: llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 34637, column 5
Division by zero

Annotated Source Code


clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debugger-tuning=gdb -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-11/lib/clang/11.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86 -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/include -I /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-11/lib/clang/11.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/build-llvm/lib/Target/X86 -fdebug-prefix-map=/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347=. -ferror-limit 19 -fmessage-length 0 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -o /tmp/scan-build-2020-03-09-184146-41876-1 -x c++ /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp

1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "Utils/X86ShuffleDecode.h"
16#include "X86CallingConv.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86TargetMachine.h"
22#include "X86TargetObjectFile.h"
23#include "llvm/ADT/SmallBitVector.h"
24#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
26#include "llvm/ADT/StringExtras.h"
27#include "llvm/ADT/StringSwitch.h"
28#include "llvm/Analysis/BlockFrequencyInfo.h"
29#include "llvm/Analysis/EHPersonalities.h"
30#include "llvm/Analysis/ProfileSummaryInfo.h"
31#include "llvm/Analysis/VectorUtils.h"
32#include "llvm/CodeGen/IntrinsicLowering.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineInstrBuilder.h"
36#include "llvm/CodeGen/MachineJumpTableInfo.h"
37#include "llvm/CodeGen/MachineModuleInfo.h"
38#include "llvm/CodeGen/MachineRegisterInfo.h"
39#include "llvm/CodeGen/TargetLowering.h"
40#include "llvm/CodeGen/WinEHFuncInfo.h"
41#include "llvm/IR/CallSite.h"
42#include "llvm/IR/CallingConv.h"
43#include "llvm/IR/Constants.h"
44#include "llvm/IR/DerivedTypes.h"
45#include "llvm/IR/DiagnosticInfo.h"
46#include "llvm/IR/Function.h"
47#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/GlobalVariable.h"
49#include "llvm/IR/Instructions.h"
50#include "llvm/IR/Intrinsics.h"
51#include "llvm/MC/MCAsmInfo.h"
52#include "llvm/MC/MCContext.h"
53#include "llvm/MC/MCExpr.h"
54#include "llvm/MC/MCSymbol.h"
55#include "llvm/Support/CommandLine.h"
56#include "llvm/Support/Debug.h"
57#include "llvm/Support/ErrorHandling.h"
58#include "llvm/Support/KnownBits.h"
59#include "llvm/Support/MathExtras.h"
60#include "llvm/Target/TargetOptions.h"
61#include <algorithm>
62#include <bitset>
63#include <cctype>
64#include <numeric>
65using namespace llvm;
66
67#define DEBUG_TYPE "x86-isel"
68
69STATISTIC(NumTailCalls, "Number of tail calls");
70
71static cl::opt<int> ExperimentalPrefLoopAlignment(
72 "x86-experimental-pref-loop-alignment", cl::init(4),
73 cl::desc(
74 "Sets the preferable loop alignment for experiments (as log2 bytes)"
75 "(the last x86-experimental-pref-loop-alignment bits"
76 " of the loop header PC will be 0)."),
77 cl::Hidden);
78
79// Added in 10.0.
80static cl::opt<bool> EnableOldKNLABI(
81 "x86-enable-old-knl-abi", cl::init(false),
82 cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
83 "one ZMM register on AVX512F, but not AVX512BW targets."),
84 cl::Hidden);
85
86static cl::opt<bool> MulConstantOptimization(
87 "mul-constant-optimization", cl::init(true),
88 cl::desc("Replace 'mul x, Const' with more effective instructions like "
89 "SHIFT, LEA, etc."),
90 cl::Hidden);
91
92static cl::opt<bool> ExperimentalUnorderedISEL(
93 "x86-experimental-unordered-atomic-isel", cl::init(false),
94 cl::desc("Use LoadSDNode and StoreSDNode instead of "
95 "AtomicSDNode for unordered atomic loads and "
96 "stores respectively."),
97 cl::Hidden);
98
99/// Call this when the user attempts to do something unsupported, like
100/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
101/// report_fatal_error, so calling code should attempt to recover without
102/// crashing.
103static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
104 const char *Msg) {
105 MachineFunction &MF = DAG.getMachineFunction();
106 DAG.getContext()->diagnose(
107 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
108}
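// Annotation (not part of the original file): a hypothetical call site, to show
// how errorUnsupported is meant to be used. The condition and message below are
// illustrative, not quoted from this file:
//   if (!Subtarget.hasSSE2())
//     errorUnsupported(DAG, dl, "returning double requires SSE2");
//   // ...then continue lowering with a best-effort value instead of aborting.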
109
110X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
111 const X86Subtarget &STI)
112 : TargetLowering(TM), Subtarget(STI) {
113 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
114 X86ScalarSSEf64 = Subtarget.hasSSE2();
115 X86ScalarSSEf32 = Subtarget.hasSSE1();
116 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
117
118 // Set up the TargetLowering object.
119
120 // X86 is weird. It always uses i8 for shift amounts and setcc results.
121 setBooleanContents(ZeroOrOneBooleanContent);
122 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
123 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
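// Annotation (sketch, not from this file): these two settings describe what the
// rest of the DAG may assume about compare results. A scalar SETCC is 0 or 1
// (matching the SETcc instructions), while a vector compare such as PCMPGTD
// produces per-lane masks of all-zeros or all-ones, i.e. 0 or -1:
//   icmp sgt <4 x i32> %a, %b   -->   PCMPGTD: each lane is 0xFFFFFFFF or 0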
124
125 // For 64-bit, since we have so many registers, use the ILP scheduler.
126 // For 32-bit, use the register pressure specific scheduling.
127 // For Atom, always use ILP scheduling.
128 if (Subtarget.isAtom())
129 setSchedulingPreference(Sched::ILP);
130 else if (Subtarget.is64Bit())
131 setSchedulingPreference(Sched::ILP);
132 else
133 setSchedulingPreference(Sched::RegPressure);
134 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
135 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
136
137 // Bypass expensive divides and use cheaper ones.
138 if (TM.getOptLevel() >= CodeGenOpt::Default) {
139 if (Subtarget.hasSlowDivide32())
140 addBypassSlowDiv(32, 8);
141 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
142 addBypassSlowDiv(64, 32);
143 }
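// Annotation (sketch, not from this file): addBypassSlowDiv(32, 8) asks the
// slow-division bypass transform to guard 32-bit divisions with a runtime check
// and use the cheaper 8-bit DIV when both operands fit, roughly:
//   if (((a | b) & 0xFFFFFF00) == 0)
//     q = (uint8_t)a / (uint8_t)b;   // fast 8-bit divide
//   else
//     q = a / b;                     // full 32-bit divide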
144
145 if (Subtarget.isTargetWindowsMSVC() ||
146 Subtarget.isTargetWindowsItanium()) {
147 // Setup Windows compiler runtime calls.
148 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
149 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
150 setLibcallName(RTLIB::SREM_I64, "_allrem");
151 setLibcallName(RTLIB::UREM_I64, "_aullrem");
152 setLibcallName(RTLIB::MUL_I64, "_allmul");
153 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
154 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
155 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
156 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
157 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
158 }
159
160 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
161 // MSVCRT doesn't have powi; fall back to pow
162 setLibcallName(RTLIB::POWI_F32, nullptr);
163 setLibcallName(RTLIB::POWI_F64, nullptr);
164 }
165
166 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
167 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
168 // FIXME: Should we be limiting the atomic size on other configs? Default is
169 // 1024.
170 if (!Subtarget.hasCmpxchg8b())
171 setMaxAtomicSizeInBitsSupported(32);
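// Annotation (sketch, not from this file): with the 32-bit cap, AtomicExpandPass
// turns wider atomics into libatomic calls instead of relying on cmpxchg8b,
// e.g. an i64 seq_cst atomic load becomes a call to __atomic_load_8.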
172
173 // Set up the register classes.
174 addRegisterClass(MVT::i8, &X86::GR8RegClass);
175 addRegisterClass(MVT::i16, &X86::GR16RegClass);
176 addRegisterClass(MVT::i32, &X86::GR32RegClass);
177 if (Subtarget.is64Bit())
178 addRegisterClass(MVT::i64, &X86::GR64RegClass);
179
180 for (MVT VT : MVT::integer_valuetypes())
181 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
182
183 // We don't accept any truncstore of integer registers.
184 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
185 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
186 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
187 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
188 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
189 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
190
191 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
192
193 // SETOEQ and SETUNE require checking two conditions.
194 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
195 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
196 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
197 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
198 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
199 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
200
201 // Integer absolute.
202 if (Subtarget.hasCMov()) {
203 setOperationAction(ISD::ABS , MVT::i16 , Custom);
204 setOperationAction(ISD::ABS , MVT::i32 , Custom);
205 }
206 setOperationAction(ISD::ABS , MVT::i64 , Custom);
207
208 // Funnel shifts.
209 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
210 setOperationAction(ShiftOp , MVT::i16 , Custom);
211 setOperationAction(ShiftOp , MVT::i32 , Custom);
212 if (Subtarget.is64Bit())
213 setOperationAction(ShiftOp , MVT::i64 , Custom);
214 }
215
216 if (!Subtarget.useSoftFloat()) {
217 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
218 // operation.
219 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
220 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
221 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
222 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
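// Annotation (sketch, not from this file): "Promote" here means the i8/i16
// operand is zero-extended to i32 and converted through the signed i32 path,
// which is exact because every u8/u16 value is a non-negative i32, e.g.:
//   uitofp i16 %x to float   ==   sitofp (zext i16 %x to i32) to float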
223 // We have an algorithm for SSE2, and we turn this into a 64-bit
224 // FILD or VCVTUSI2SS/SD for other targets.
225 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
226 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
227 // We have an algorithm for SSE2->double, and we turn this into a
228 // 64-bit FILD followed by conditional FADD for other targets.
229 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
230 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
231
232 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
233 // this operation.
234 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
235 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
236 // SSE has no i16 to fp conversion, only i32. We promote in the handler
237 // to allow f80 to use i16 and f64 to use i16 with sse1 only
238 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
239 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
240 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
241 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
242 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
243 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
244 // are Legal, f80 is custom lowered.
245 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
246 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
247
248 // Promote i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
249 // this operation.
250 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
251 // FIXME: This doesn't generate invalid exception when it should. PR44019.
252 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
253 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
254 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
255 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
256 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
257 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
258 // are Legal, f80 is custom lowered.
259 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
260 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
261
262 // Handle FP_TO_UINT by promoting the destination to a larger signed
263 // conversion.
264 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
265 // FIXME: This doesn't generate invalid exception when it should. PR44019.
266 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
267 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
268 // FIXME: This doesn't generate invalid exception when it should. PR44019.
269 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
270 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
271 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
272 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
274
275 setOperationAction(ISD::LRINT, MVT::f32, Custom);
276 setOperationAction(ISD::LRINT, MVT::f64, Custom);
277 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
278 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
279
280 if (!Subtarget.is64Bit()) {
281 setOperationAction(ISD::LRINT, MVT::i64, Custom);
282 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
283 }
284 }
285
286 // Handle address space casts between mixed sized pointers.
287 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
288 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
289
290 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
291 if (!X86ScalarSSEf64) {
292 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
293 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
294 if (Subtarget.is64Bit()) {
295 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
296 // Without SSE, i64->f64 goes through memory.
297 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
298 }
299 } else if (!Subtarget.is64Bit())
300 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
301
302 // Scalar integer divide and remainder are lowered to use operations that
303 // produce two results, to match the available instructions. This exposes
304 // the two-result form to trivial CSE, which is able to combine x/y and x%y
305 // into a single instruction.
306 //
307 // Scalar integer multiply-high is also lowered to use two-result
308 // operations, to match the available instructions. However, plain multiply
309 // (low) operations are left as Legal, as there are single-result
310 // instructions for this in x86. Using the two-result multiply instructions
311 // when both high and low results are needed must be arranged by dagcombine.
312 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
313 setOperationAction(ISD::MULHS, VT, Expand);
314 setOperationAction(ISD::MULHU, VT, Expand);
315 setOperationAction(ISD::SDIV, VT, Expand);
316 setOperationAction(ISD::UDIV, VT, Expand);
317 setOperationAction(ISD::SREM, VT, Expand);
318 setOperationAction(ISD::UREM, VT, Expand);
319 }
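// Annotation (sketch, not from this file): expanding SDIV/SREM to the
// two-result SDIVREM form lets CSE share one hardware divide, e.g.:
//   int q = x / y;
//   int r = x % y;
// lowers to a single IDIV, which already produces the quotient in EAX and the
// remainder in EDX.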
320
321 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
322 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
323 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
324 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
325 setOperationAction(ISD::BR_CC, VT, Expand);
326 setOperationAction(ISD::SELECT_CC, VT, Expand);
327 }
328 if (Subtarget.is64Bit())
329 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
330 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
331 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
332 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
333
334 setOperationAction(ISD::FREM , MVT::f32 , Expand);
335 setOperationAction(ISD::FREM , MVT::f64 , Expand);
336 setOperationAction(ISD::FREM , MVT::f80 , Expand);
337 setOperationAction(ISD::FREM , MVT::f128 , Expand);
338 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
339
340 // Promote the i8 variants and force them on up to i32 which has a shorter
341 // encoding.
342 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
343 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
344 if (!Subtarget.hasBMI()) {
345 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
346 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
347 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
348 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
349 if (Subtarget.is64Bit()) {
350 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
351 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
352 }
353 }
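// Annotation (sketch, not from this file): without BMI/TZCNT, plain CTTZ is
// Custom because BSF leaves its destination undefined when the input is zero,
// so the lowering must add a CMOV or branch for that case; CTTZ_ZERO_UNDEF can
// map directly to BSF and therefore stays Legal.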
354
355 if (Subtarget.hasLZCNT()) {
356 // When promoting the i8 variants, force them to i32 for a shorter
357 // encoding.
358 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
359 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
360 } else {
361 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
362 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
363 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
364 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
365 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
366 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
367 if (Subtarget.is64Bit()) {
368 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
369 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
370 }
371 }
372
373 // Special handling for half-precision floating point conversions.
374 // If we don't have F16C support, then lower half float conversions
375 // into library calls.
376 if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
377 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Custom);
378 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Custom);
379 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
380 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Custom);
381 } else {
382 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
383 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand);
384 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
385 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand);
386 }
387
388 // There's never any support for operations beyond MVT::f32.
389 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
390 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
391 setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
392 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand);
393 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f80, Expand);
394 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f128, Expand);
395 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
396 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
397 setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
398 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand);
399 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f80, Expand);
400 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f128, Expand);
401
402 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
403 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
404 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
405 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
406 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
407 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
408 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
409 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
410
411 if (Subtarget.hasPOPCNT()) {
412 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
413 } else {
414 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
415 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
416 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
417 if (Subtarget.is64Bit())
418 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
419 else
420 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
421 }
422
423 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
424
425 if (!Subtarget.hasMOVBE())
426 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
427
428 // X86 wants to expand cmov itself.
429 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
430 setOperationAction(ISD::SELECT, VT, Custom);
431 setOperationAction(ISD::SETCC, VT, Custom);
432 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
433 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
434 }
435 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
436 if (VT == MVT::i64 && !Subtarget.is64Bit())
437 continue;
438 setOperationAction(ISD::SELECT, VT, Custom);
439 setOperationAction(ISD::SETCC, VT, Custom);
440 }
441
442 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
443 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
444 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
445
446 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
447 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
448 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
449 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
450 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
451 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
452 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
453 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
454
455 // Darwin ABI issue.
456 for (auto VT : { MVT::i32, MVT::i64 }) {
457 if (VT == MVT::i64 && !Subtarget.is64Bit())
458 continue;
459 setOperationAction(ISD::ConstantPool , VT, Custom);
460 setOperationAction(ISD::JumpTable , VT, Custom);
461 setOperationAction(ISD::GlobalAddress , VT, Custom);
462 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
463 setOperationAction(ISD::ExternalSymbol , VT, Custom);
464 setOperationAction(ISD::BlockAddress , VT, Custom);
465 }
466
467 // 64-bit shl, sra, srl (iff 32-bit x86)
468 for (auto VT : { MVT::i32, MVT::i64 }) {
469 if (VT == MVT::i64 && !Subtarget.is64Bit())
470 continue;
471 setOperationAction(ISD::SHL_PARTS, VT, Custom);
472 setOperationAction(ISD::SRA_PARTS, VT, Custom);
473 setOperationAction(ISD::SRL_PARTS, VT, Custom);
474 }
475
476 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
477 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
478
479 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
480
481 // Expand certain atomics
482 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
483 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
484 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
485 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
486 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
487 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
488 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
489 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
490 }
491
492 if (!Subtarget.is64Bit())
493 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
494
495 if (Subtarget.hasCmpxchg16b()) {
496 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
497 }
498
499 // FIXME - use subtarget debug flags
500 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
501 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
502 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
503 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
504 }
505
506 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
507 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
508
509 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
510 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
511
512 setOperationAction(ISD::TRAP, MVT::Other, Legal);
513 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
514
515 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
516 setOperationAction(ISD::VASTART , MVT::Other, Custom);
517 setOperationAction(ISD::VAEND , MVT::Other, Expand);
518 bool Is64Bit = Subtarget.is64Bit();
519 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
520 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
521
522 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
523 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
524
525 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
526
527 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
528 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
529 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
530
531 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
532 // f32 and f64 use SSE.
533 // Set up the FP register classes.
534 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
535 : &X86::FR32RegClass);
536 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
537 : &X86::FR64RegClass);
538
539 // Disable f32->f64 extload as we can only generate this in one instruction
540 // under optsize. So it's easier to pattern match (fpext (load)) for that
541 // case instead of needing to emit 2 instructions for extload in the
542 // non-optsize case.
543 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
544
545 for (auto VT : { MVT::f32, MVT::f64 }) {
546 // Use ANDPD to simulate FABS.
547 setOperationAction(ISD::FABS, VT, Custom);
548
549 // Use XORP to simulate FNEG.
550 setOperationAction(ISD::FNEG, VT, Custom);
551
552 // Use ANDPD and ORPD to simulate FCOPYSIGN.
553 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
554
555 // These might be better off as horizontal vector ops.
556 setOperationAction(ISD::FADD, VT, Custom);
557 setOperationAction(ISD::FSUB, VT, Custom);
558
559 // We don't support sin/cos/fmod
560 setOperationAction(ISD::FSIN , VT, Expand);
561 setOperationAction(ISD::FCOS , VT, Expand);
562 setOperationAction(ISD::FSINCOS, VT, Expand);
563 }
564
565 // Lower this to MOVMSK plus an AND.
566 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
567 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
568
569 } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
570 (UseX87 || Is64Bit)) {
571 // Use SSE for f32, x87 for f64.
572 // Set up the FP register classes.
573 addRegisterClass(MVT::f32, &X86::FR32RegClass);
574 if (UseX87)
575 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
576
577 // Use ANDPS to simulate FABS.
578 setOperationAction(ISD::FABS , MVT::f32, Custom);
579
580 // Use XORP to simulate FNEG.
581 setOperationAction(ISD::FNEG , MVT::f32, Custom);
582
583 if (UseX87)
584 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
585
586 // Use ANDPS and ORPS to simulate FCOPYSIGN.
587 if (UseX87)
588 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
589 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
590
591 // We don't support sin/cos/fmod
592 setOperationAction(ISD::FSIN , MVT::f32, Expand);
593 setOperationAction(ISD::FCOS , MVT::f32, Expand);
594 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
595
596 if (UseX87) {
597 // Always expand sin/cos functions even though x87 has an instruction.
598 setOperationAction(ISD::FSIN, MVT::f64, Expand);
599 setOperationAction(ISD::FCOS, MVT::f64, Expand);
600 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
601 }
602 } else if (UseX87) {
603 // f32 and f64 in x87.
604 // Set up the FP register classes.
605 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
606 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
607
608 for (auto VT : { MVT::f32, MVT::f64 }) {
609 setOperationAction(ISD::UNDEF, VT, Expand);
610 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
611
612 // Always expand sin/cos functions even though x87 has an instruction.
613 setOperationAction(ISD::FSIN , VT, Expand);
614 setOperationAction(ISD::FCOS , VT, Expand);
615 setOperationAction(ISD::FSINCOS, VT, Expand);
616 }
617 }
618
619 // Expand FP32 immediates into loads from the stack, save special cases.
620 if (isTypeLegal(MVT::f32)) {
621 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
622 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
623 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
624 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
625 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
626 } else // SSE immediates.
627 addLegalFPImmediate(APFloat(+0.0f)); // xorps
628 }
629 // Expand FP64 immediates into loads from the stack, save special cases.
630 if (isTypeLegal(MVT::f64)) {
631 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
632 addLegalFPImmediate(APFloat(+0.0)); // FLD0
633 addLegalFPImmediate(APFloat(+1.0)); // FLD1
634 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
635 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
636 } else // SSE immediates.
637 addLegalFPImmediate(APFloat(+0.0)); // xorpd
638 }
639 // Handle constrained floating-point operations of scalar.
640 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
641 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
642 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
643 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
644 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
645 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
646 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
647 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
648 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
649 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
650 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
651 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
652 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
653
654 // We don't support FMA.
655 setOperationAction(ISD::FMA, MVT::f64, Expand);
656 setOperationAction(ISD::FMA, MVT::f32, Expand);
657
658 // f80 always uses X87.
659 if (UseX87) {
660 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
661 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
662 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
663 {
664 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
665 addLegalFPImmediate(TmpFlt); // FLD0
666 TmpFlt.changeSign();
667 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
668
669 bool ignored;
670 APFloat TmpFlt2(+1.0);
671 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
672 &ignored);
673 addLegalFPImmediate(TmpFlt2); // FLD1
674 TmpFlt2.changeSign();
675 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
676 }
677
678 // Always expand sin/cos functions even though x87 has an instruction.
679 setOperationAction(ISD::FSIN , MVT::f80, Expand);
680 setOperationAction(ISD::FCOS , MVT::f80, Expand);
681 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
682
683 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
684 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
685 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
686 setOperationAction(ISD::FRINT, MVT::f80, Expand);
687 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
688 setOperationAction(ISD::FMA, MVT::f80, Expand);
689 setOperationAction(ISD::LROUND, MVT::f80, Expand);
690 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
691 setOperationAction(ISD::LRINT, MVT::f80, Custom);
692 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
693
694 // Handle constrained floating-point operations of scalar.
695 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
696 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
697 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
698 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
699 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
700 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
701 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
702 // as Custom.
703 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
704 }
705
706 // f128 uses xmm registers, but most operations require libcalls.
707 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
708 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
709 : &X86::VR128RegClass);
710
711 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
712
713 setOperationAction(ISD::FADD, MVT::f128, LibCall);
714 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
715 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
716 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
717 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
718 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
719 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
720 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
721 setOperationAction(ISD::FMA, MVT::f128, LibCall);
722 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
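// Annotation (sketch, not from this file): the LibCall entries above route f128
// arithmetic to the soft-float runtime, e.g. an f128 FADD becomes a call to
// __addtf3 and an f128 FMUL a call to __multf3, since no x86 instruction
// operates on IEEE binary128 values directly.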
723
724 setOperationAction(ISD::FABS, MVT::f128, Custom);
725 setOperationAction(ISD::FNEG, MVT::f128, Custom);
726 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
727
728 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
729 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
730 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
731 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
732 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
733 // No STRICT_FSINCOS
734 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
735 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
736
737 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
738 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
739 // We need to custom handle any FP_ROUND with an f128 input, but
740 // LegalizeDAG uses the result type to know when to run a custom handler.
741 // So we have to list all legal floating point result types here.
742 if (isTypeLegal(MVT::f32)) {
743 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
744 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
745 }
746 if (isTypeLegal(MVT::f64)) {
747 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
748 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
749 }
750 if (isTypeLegal(MVT::f80)) {
751 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
752 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
753 }
754
755 setOperationAction(ISD::SETCC, MVT::f128, Custom);
756
757 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
758 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
759 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
760 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
761 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
762 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
763 }
764
765 // Always use a library call for pow.
766 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
767 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
768 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
769 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
770
771 setOperationAction(ISD::FLOG, MVT::f80, Expand);
772 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
773 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
774 setOperationAction(ISD::FEXP, MVT::f80, Expand);
775 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
776 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
777 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
778
779 // Some FP actions are always expanded for vector types.
780 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
781 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
782 setOperationAction(ISD::FSIN, VT, Expand);
783 setOperationAction(ISD::FSINCOS, VT, Expand);
784 setOperationAction(ISD::FCOS, VT, Expand);
785 setOperationAction(ISD::FREM, VT, Expand);
786 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
787 setOperationAction(ISD::FPOW, VT, Expand);
788 setOperationAction(ISD::FLOG, VT, Expand);
789 setOperationAction(ISD::FLOG2, VT, Expand);
790 setOperationAction(ISD::FLOG10, VT, Expand);
791 setOperationAction(ISD::FEXP, VT, Expand);
792 setOperationAction(ISD::FEXP2, VT, Expand);
793 }
794
795 // First set operation action for all vector types to either promote
796 // (for widening) or expand (for scalarization). Then we will selectively
797 // turn on ones that can be effectively codegen'd.
798 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
799 setOperationAction(ISD::SDIV, VT, Expand);
800 setOperationAction(ISD::UDIV, VT, Expand);
801 setOperationAction(ISD::SREM, VT, Expand);
802 setOperationAction(ISD::UREM, VT, Expand);
803 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
804 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
805 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
806 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
807 setOperationAction(ISD::FMA, VT, Expand);
808 setOperationAction(ISD::FFLOOR, VT, Expand);
809 setOperationAction(ISD::FCEIL, VT, Expand);
810 setOperationAction(ISD::FTRUNC, VT, Expand);
811 setOperationAction(ISD::FRINT, VT, Expand);
812 setOperationAction(ISD::FNEARBYINT, VT, Expand);
813 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
814 setOperationAction(ISD::MULHS, VT, Expand);
815 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
816 setOperationAction(ISD::MULHU, VT, Expand);
817 setOperationAction(ISD::SDIVREM, VT, Expand);
818 setOperationAction(ISD::UDIVREM, VT, Expand);
819 setOperationAction(ISD::CTPOP, VT, Expand);
820 setOperationAction(ISD::CTTZ, VT, Expand);
821 setOperationAction(ISD::CTLZ, VT, Expand);
822 setOperationAction(ISD::ROTL, VT, Expand);
823 setOperationAction(ISD::ROTR, VT, Expand);
824 setOperationAction(ISD::BSWAP, VT, Expand);
825 setOperationAction(ISD::SETCC, VT, Expand);
826 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
827 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
828 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
829 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
830 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
831 setOperationAction(ISD::TRUNCATE, VT, Expand);
832 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
833 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
834 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
835 setOperationAction(ISD::SELECT_CC, VT, Expand);
836 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
837 setTruncStoreAction(InnerVT, VT, Expand);
838
839 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
840 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
841
842 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
843 // types, we have to deal with them whether we ask for Expansion or not.
844 // Setting Expand causes its own optimisation problems though, so leave
845 // them legal.
846 if (VT.getVectorElementType() == MVT::i1)
847 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
848
849 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
850 // split/scalarized right now.
851 if (VT.getVectorElementType() == MVT::f16)
852 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
853 }
854 }
855
856 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
857 // with -msoft-float, disable use of MMX as well.
858 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
859 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
860 // No operations on x86mmx supported, everything uses intrinsics.
861 }
862
863 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
864 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
865 : &X86::VR128RegClass);
866
867 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
868 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
869 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
870 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
871 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
872 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
873 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
874 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
875
876 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
877 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
878
879 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
880 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
881 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
882 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
883 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
884 }
885
886 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
887 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
888 : &X86::VR128RegClass);
889
890 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
891 // registers cannot be used even for integer operations.
892 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
893 : &X86::VR128RegClass);
894 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
895 : &X86::VR128RegClass);
896 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
897 : &X86::VR128RegClass);
898 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
899 : &X86::VR128RegClass);
900
901 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
902 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
903 setOperationAction(ISD::SDIV, VT, Custom);
904 setOperationAction(ISD::SREM, VT, Custom);
905 setOperationAction(ISD::UDIV, VT, Custom);
906 setOperationAction(ISD::UREM, VT, Custom);
907 }
908
909 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
910 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
911 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
912
913 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
914 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
915 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
916 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
917 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
918 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
919 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
920 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
921 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
922 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
923 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
924 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
925 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
926
927 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
928 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
929 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
930 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
931 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
932 }
933
934 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
935 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
936 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
937 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
938 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
939 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
940 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
941 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
942 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
943 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
944 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
945 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
946
947 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
948 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
949 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
950
951 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
952 setOperationAction(ISD::SETCC, VT, Custom);
953 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
954 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
955 setOperationAction(ISD::CTPOP, VT, Custom);
956 setOperationAction(ISD::ABS, VT, Custom);
957
958 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
959 // setcc all the way to isel and prefer SETGT in some isel patterns.
960 setCondCodeAction(ISD::SETLT, VT, Custom);
961 setCondCodeAction(ISD::SETLE, VT, Custom);
962 }
963
964 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
965 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
966 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
967 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
968 setOperationAction(ISD::VSELECT, VT, Custom);
969 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
970 }
971
972 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
973 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
974 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
975 setOperationAction(ISD::VSELECT, VT, Custom);
976
977 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
978 continue;
979
980 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
981 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
982 }
983
984 // Custom lower v2i64 and v2f64 selects.
985 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
986 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
987 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
988 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
989 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
990
991 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
992 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
993 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
994 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
995
996 // Custom legalize these to avoid over promotion or custom promotion.
997 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
998 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
999 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1000 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1001 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1002 }
1003
1004 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1005 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
1006 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1007 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1008
1009 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1010 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1011
1012 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1013 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1014
1015 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1016 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1017 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1018 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1019 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1020
1021 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1022 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1023 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1024 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1025
1026 // We want to legalize this to an f64 load rather than an i64 load on
1027 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1028 // store.
1029 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1030 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1031 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1032 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1033 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1034 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1035
1036 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1037 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1038 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1039 if (!Subtarget.hasAVX512())
1040 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1041
1042 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1043 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1044 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1045
1046 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1047
1048 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1049 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1050 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1051 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1052 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1053 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1054
1055 // In the customized shift lowering, the legal v4i32/v2i64 cases
1056 // in AVX2 will be recognized.
1057 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1058 setOperationAction(ISD::SRL, VT, Custom);
1059 setOperationAction(ISD::SHL, VT, Custom);
1060 setOperationAction(ISD::SRA, VT, Custom);
1061 }
1062
1063 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
1064 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
1065
1066 // With AVX512, expanding (and promoting the shifts) is better.
1067 if (!Subtarget.hasAVX512())
1068 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
1069
1070 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1071 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1072 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1073 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1074 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1075 }
1076
1077 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1078 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1079 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1080 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1081 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1082 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1083 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1084 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1085 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1086
1087 // These might be better off as horizontal vector ops.
1088 setOperationAction(ISD::ADD, MVT::i16, Custom);
1089 setOperationAction(ISD::ADD, MVT::i32, Custom);
1090 setOperationAction(ISD::SUB, MVT::i16, Custom);
1091 setOperationAction(ISD::SUB, MVT::i32, Custom);
1092 }
1093
1094 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1095 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1096 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1097 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1098 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1099 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1100 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1101 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1102 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1103 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1104 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1105 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1106
1107 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1108 }
1109
1110 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1111 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1112 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1113 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1114 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1115 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1116 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1117 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1118
1119 // FIXME: Do we need to handle scalar-to-vector here?
1120 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1121
1122 // We directly match byte blends in the backend as they match the VSELECT
1123 // condition form.
1124 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1125
1126 // SSE41 brings specific instructions for doing vector sign extend even in
1127 // cases where we don't have SRA.
1128 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1129 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1130 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1131 }
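// Annotation (sketch, not from this file): the PMOVSX/PMOVZX family does this
// in-register widening directly, e.g. PMOVSXBW sign-extends the low 8 bytes of
// an XMM register to <8 x i16> without the shift-left/arithmetic-shift-right
// sequence needed on plain SSE2.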
1132
1133 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1134 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1135 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1136 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1137 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1138 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1139 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1140 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1141 }
1142
1143 // i8 vectors are custom because the source register and source
1144 // memory operand types are not the same width.
1145 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1146
1147 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1148 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1149 // do the pre and post work in the vector domain.
1150 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1151 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1152 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1153 // so that DAG combine doesn't try to turn it into uint_to_fp.
1154 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1155 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1156 }
1157 }
1158
1159 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1160 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1161 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1162 setOperationAction(ISD::ROTL, VT, Custom);
1163
1164 // XOP can efficiently perform BITREVERSE with VPPERM.
1165 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1166 setOperationAction(ISD::BITREVERSE, VT, Custom);
1167
1168 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1169 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1170 setOperationAction(ISD::BITREVERSE, VT, Custom);
1171 }
1172
1173 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1174 bool HasInt256 = Subtarget.hasInt256();
1175
1176 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1177 : &X86::VR256RegClass);
1178 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1179 : &X86::VR256RegClass);
1180 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1181 : &X86::VR256RegClass);
1182 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1183 : &X86::VR256RegClass);
1184 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1185 : &X86::VR256RegClass);
1186 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1187 : &X86::VR256RegClass);
1188
1189 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1190 setOperationAction(ISD::FFLOOR, VT, Legal);
1191 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1192 setOperationAction(ISD::FCEIL, VT, Legal);
1193 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1194 setOperationAction(ISD::FTRUNC, VT, Legal);
1195 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1196 setOperationAction(ISD::FRINT, VT, Legal);
1197 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1198 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1199 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1200
1201 setOperationAction(ISD::FROUND, VT, Custom);
1202
1203 setOperationAction(ISD::FNEG, VT, Custom);
1204 setOperationAction(ISD::FABS, VT, Custom);
1205 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1206 }
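    // These floor/ceil/trunc/rint/nearbyint actions map onto VROUNDPS/VROUNDPD
    // with the matching rounding-mode immediate; FROUND stays Custom, likely
    // because its ties-away-from-zero behavior has no single ROUND immediate,
    // and FNEG/FABS/FCOPYSIGN are built from sign-bit mask constants.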
1207
1208 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1209 // even though v8i16 is a legal type.
1210 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1211 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1212 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1213 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1214 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1215 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
1216
1217 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1218 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
1219
1220 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1221 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1222 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1223 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1224 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1225 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1226 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1227 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1228 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1229 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
1230 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1231 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1232
1233 if (!Subtarget.hasAVX512())
1234 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1235
1236 // In the customized shift lowering, the legal v8i32/v4i64 cases
1237 // in AVX2 will be recognized.
1238 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1239 setOperationAction(ISD::SRL, VT, Custom);
1240 setOperationAction(ISD::SHL, VT, Custom);
1241 setOperationAction(ISD::SRA, VT, Custom);
1242 }
1243
1244 // These types need custom splitting if their input is a 128-bit vector.
1245 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1246 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1247 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1248 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1249
1250 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1251 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1252
1253 // With BWI, expanding (and promoting the shifts) is better.
1254 if (!Subtarget.hasBWI())
1255 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1256
1257 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1258 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1259 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1260 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1261 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1262 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1263
1264 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1265 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1266 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1267 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1268 }
1269
1270 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1271 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1272 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1273 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1274
1275 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1276 setOperationAction(ISD::SETCC, VT, Custom);
1277 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1278 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1279 setOperationAction(ISD::CTPOP, VT, Custom);
1280 setOperationAction(ISD::CTLZ, VT, Custom);
1281
1282 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1283 // setcc all the way to isel and prefer SETGT in some isel patterns.
1284 setCondCodeAction(ISD::SETLT, VT, Custom);
1285 setCondCodeAction(ISD::SETLE, VT, Custom);
1286 }
1287
1288 if (Subtarget.hasAnyFMA()) {
1289 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1290 MVT::v2f64, MVT::v4f64 }) {
1291 setOperationAction(ISD::FMA, VT, Legal);
1292 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1293 }
1294 }
1295
1296 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1297 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1298 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1299 }
1300
1301 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1302 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1303 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1304 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1305
1306 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1307 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1308 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1309 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1310 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1311 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1312
1313 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1314 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1315 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1316 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1317 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1318
1319 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1320 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1321 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1322 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1323 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1324 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1325 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1326 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1327
1328 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1329 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1330 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1331 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1332 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1333 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1334 }
1335
1336 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1337 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1338 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1339 }
1340
1341 if (HasInt256) {
1342 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1343 // when we have a 256-bit wide blend with immediate.
1344 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1345 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1346
1347 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1348 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1349 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1350 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1351 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1352 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1353 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1354 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1355 }
1356 }
1357
1358 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1359 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1360 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1361 setOperationAction(ISD::MSTORE, VT, Legal);
1362 }
1363
1364 // Extract subvector is special because the value type
1365 // (result) is 128-bit but the source is 256-bit wide.
1366 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1367 MVT::v4f32, MVT::v2f64 }) {
1368 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1369 }
1370
1371 // Custom lower several nodes for 256-bit types.
1372 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1373 MVT::v8f32, MVT::v4f64 }) {
1374 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1375 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1376 setOperationAction(ISD::VSELECT, VT, Custom);
1377 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1378 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1379 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1380 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1381 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1382 setOperationAction(ISD::STORE, VT, Custom);
1383 }
1384
1385 if (HasInt256) {
1386 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1387
1388 // Custom legalize 2x32 to get a little better code.
1389 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1390 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1391
1392 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1393 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1394 setOperationAction(ISD::MGATHER, VT, Custom);
1395 }
1396 }
1397
1398 // This block controls legalization of the mask vector sizes that are
1399 // available with AVX512. 512-bit vectors are in a separate block controlled
1400 // by useAVX512Regs.
1401 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1402 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1403 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1404 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1405 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1406 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1407
1408 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1409 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1410 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1411
1412 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1413 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1414 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1415 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1416 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1417 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1418 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1419 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1420 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1421 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1422 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1423 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1424
1425 // There is no byte sized k-register load or store without AVX512DQ.
1426 if (!Subtarget.hasDQI()) {
1427 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1428 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1429 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1430 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1431
1432 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1433 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1434 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1435 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1436 }
1437
1438 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1439 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1440 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1441 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1442 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1443 }
1444
1445 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1446 setOperationAction(ISD::ADD, VT, Custom);
1447 setOperationAction(ISD::SUB, VT, Custom);
1448 setOperationAction(ISD::MUL, VT, Custom);
1449 setOperationAction(ISD::UADDSAT, VT, Custom);
1450 setOperationAction(ISD::SADDSAT, VT, Custom);
1451 setOperationAction(ISD::USUBSAT, VT, Custom);
1452 setOperationAction(ISD::SSUBSAT, VT, Custom);
1453 setOperationAction(ISD::VSELECT, VT, Expand);
1454 }
1455
1456 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1457 setOperationAction(ISD::SETCC, VT, Custom);
1458 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1459 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1460 setOperationAction(ISD::SELECT, VT, Custom);
1461 setOperationAction(ISD::TRUNCATE, VT, Custom);
1462
1463 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1464 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1465 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1466 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1467 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1468 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1469 }
1470
1471 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1472 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1473 }
1474
1475 // This block controls legalization for 512-bit operations with 32/64 bit
1476 // elements. 512-bits can be disabled based on prefer-vector-width and
1477 // required-vector-width function attributes.
1478 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1479 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1480 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1481 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1482 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1483
1484 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1485 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1486 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1487 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1488 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1489 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1490 }
1491
1492 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1493 setOperationAction(ISD::FNEG, VT, Custom);
1494 setOperationAction(ISD::FABS, VT, Custom);
1495 setOperationAction(ISD::FMA, VT, Legal);
1496 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1497 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1498 }
1499
1500 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1501 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1502 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1504 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1505 }
1506 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1507 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1508 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
1509 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
1510 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1511 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1512 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
1513 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
1514
1515 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1516 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1517 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1518 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1519 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1520 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1521 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1522 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1523 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1524 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1525 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
1526 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1527
1528 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1529 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1530 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1531 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1532 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1533
1534 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1535 // to 512-bit rather than use the AVX2 instructions so that we can use
1536 // k-masks.
1537 if (!Subtarget.hasVLX()) {
1538 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1539 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1540 setOperationAction(ISD::MLOAD, VT, Custom);
1541 setOperationAction(ISD::MSTORE, VT, Custom);
1542 }
1543 }
1544
1545 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1546 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1547 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1548 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1549 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1550 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1551 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1552 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1553
1554 // Need to custom widen this if we don't have AVX512BW.
1555 setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
1556 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
1557 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
1558
1559 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1560 setOperationAction(ISD::FFLOOR, VT, Legal);
1561 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1562 setOperationAction(ISD::FCEIL, VT, Legal);
1563 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1564 setOperationAction(ISD::FTRUNC, VT, Legal);
1565 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1566 setOperationAction(ISD::FRINT, VT, Legal);
1567 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1568 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1569 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1570
1571 setOperationAction(ISD::FROUND, VT, Custom);
1572
1573 setOperationAction(ISD::SELECT, VT, Custom);
1574 }
1575
1576 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1577 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
1578 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1579 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1580 }
1581
1582 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1583 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1584 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1585 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1586
1587 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1588 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1589
1590 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1591 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1592
1593 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1594 setOperationAction(ISD::SMAX, VT, Legal);
1595 setOperationAction(ISD::UMAX, VT, Legal);
1596 setOperationAction(ISD::SMIN, VT, Legal);
1597 setOperationAction(ISD::UMIN, VT, Legal);
1598 setOperationAction(ISD::ABS, VT, Legal);
1599 setOperationAction(ISD::SRL, VT, Custom);
1600 setOperationAction(ISD::SHL, VT, Custom);
1601 setOperationAction(ISD::SRA, VT, Custom);
1602 setOperationAction(ISD::CTPOP, VT, Custom);
1603 setOperationAction(ISD::ROTL, VT, Custom);
1604 setOperationAction(ISD::ROTR, VT, Custom);
1605 setOperationAction(ISD::SETCC, VT, Custom);
1606 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1607 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1608 setOperationAction(ISD::SELECT, VT, Custom);
1609
1610 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1611 // setcc all the way to isel and prefer SETGT in some isel patterns.
1612 setCondCodeAction(ISD::SETLT, VT, Custom);
1613 setCondCodeAction(ISD::SETLE, VT, Custom);
1614 }
1615
1616 if (Subtarget.hasDQI()) {
1617 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1618 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1619 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
1620 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
1621 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1622 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1623 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
1624 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
1625
1626 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1627 }
1628
1629 if (Subtarget.hasCDI()) {
1630 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1631 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1632 setOperationAction(ISD::CTLZ, VT, Legal);
1633 }
1634 } // Subtarget.hasCDI()
1635
1636 if (Subtarget.hasVPOPCNTDQ()) {
1637 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1638 setOperationAction(ISD::CTPOP, VT, Legal);
1639 }
1640
1641 // Extract subvector is special because the value type
1642 // (result) is 256-bit but the source is 512-bit wide.
1643 // 128-bit was made Legal under AVX1.
1644 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1645 MVT::v8f32, MVT::v4f64 })
1646 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1647
1648 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1649 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1650 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1651 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1652 setOperationAction(ISD::VSELECT, VT, Custom);
1653 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1654 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1655 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1656 setOperationAction(ISD::MLOAD, VT, Legal);
1657 setOperationAction(ISD::MSTORE, VT, Legal);
1658 setOperationAction(ISD::MGATHER, VT, Custom);
1659 setOperationAction(ISD::MSCATTER, VT, Custom);
1660 }
1661 if (!Subtarget.hasBWI()) {
1662 // Need to custom split v32i16/v64i8 bitcasts.
1663 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1664 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1665
1666 // Better to split these into two 256-bit ops.
1667 setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
1668 setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
1669 }
1670
1671 if (Subtarget.hasVBMI2()) {
1672 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1673 setOperationAction(ISD::FSHL, VT, Custom);
1674 setOperationAction(ISD::FSHR, VT, Custom);
1675 }
1676 }
1677 } // has AVX-512
1678
1679 // This block controls legalization for operations that don't have
1680 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1681 // narrower widths.
1682 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1683 // These operations are handled on non-VLX by artificially widening in
1684 // isel patterns.
1685
1686 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
1687 Subtarget.hasVLX() ? Legal : Custom);
1688 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
1689 Subtarget.hasVLX() ? Legal : Custom);
1690 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1691 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
1692 Subtarget.hasVLX() ? Legal : Custom);
1693 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
1694 Subtarget.hasVLX() ? Legal : Custom);
1695 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1696 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
1697 Subtarget.hasVLX() ? Legal : Custom);
1698 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
1699 Subtarget.hasVLX() ? Legal : Custom);
1700 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
1701 Subtarget.hasVLX() ? Legal : Custom);
1702 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
1703 Subtarget.hasVLX() ? Legal : Custom);
1704
1705 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1706 setOperationAction(ISD::SMAX, VT, Legal);
1707 setOperationAction(ISD::UMAX, VT, Legal);
1708 setOperationAction(ISD::SMIN, VT, Legal);
1709 setOperationAction(ISD::UMIN, VT, Legal);
1710 setOperationAction(ISD::ABS, VT, Legal);
1711 }
1712
1713 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1714 setOperationAction(ISD::ROTL, VT, Custom);
1715 setOperationAction(ISD::ROTR, VT, Custom);
1716 }
1717
1718 // Custom legalize 2x32 to get a little better code.
1719 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1720 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1721
1722 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1723 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1724 setOperationAction(ISD::MSCATTER, VT, Custom);
1725
1726 if (Subtarget.hasDQI()) {
1727 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1728 setOperationAction(ISD::SINT_TO_FP, VT,
1729 Subtarget.hasVLX() ? Legal : Custom);
1730 setOperationAction(ISD::UINT_TO_FP, VT,
1731 Subtarget.hasVLX() ? Legal : Custom);
1732 setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
1733 Subtarget.hasVLX() ? Legal : Custom);
1734 setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
1735 Subtarget.hasVLX() ? Legal : Custom);
1736 setOperationAction(ISD::FP_TO_SINT, VT,
1737 Subtarget.hasVLX() ? Legal : Custom);
1738 setOperationAction(ISD::FP_TO_UINT, VT,
1739 Subtarget.hasVLX() ? Legal : Custom);
1740 setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
1741 Subtarget.hasVLX() ? Legal : Custom);
1742 setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
1743 Subtarget.hasVLX() ? Legal : Custom);
1744 setOperationAction(ISD::MUL, VT, Legal);
1745 }
1746 }
1747
1748 if (Subtarget.hasCDI()) {
1749 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1750 setOperationAction(ISD::CTLZ, VT, Legal);
1751 }
1752 } // Subtarget.hasCDI()
1753
1754 if (Subtarget.hasVPOPCNTDQ()) {
1755 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1756 setOperationAction(ISD::CTPOP, VT, Legal);
1757 }
1758 }
1759
1760 // This block controls legalization of v32i1/v64i1, which are available with
1761 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1762 // useBWIRegs.
1763 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1764 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1765 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1766
1767 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1768 setOperationAction(ISD::ADD, VT, Custom);
1769 setOperationAction(ISD::SUB, VT, Custom);
1770 setOperationAction(ISD::MUL, VT, Custom);
1771 setOperationAction(ISD::VSELECT, VT, Expand);
1772 setOperationAction(ISD::UADDSAT, VT, Custom);
1773 setOperationAction(ISD::SADDSAT, VT, Custom);
1774 setOperationAction(ISD::USUBSAT, VT, Custom);
1775 setOperationAction(ISD::SSUBSAT, VT, Custom);
1776
1777 setOperationAction(ISD::TRUNCATE, VT, Custom);
1778 setOperationAction(ISD::SETCC, VT, Custom);
1779 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1780 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1781 setOperationAction(ISD::SELECT, VT, Custom);
1782 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1783 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1784 }
1785
1786 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1787 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1788 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1789 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1790 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1791 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1792
1793 // Extends from v32i1 masks to 256-bit vectors.
1794 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1795 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1796 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1797 }
1798
1799 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1800 // disabled based on prefer-vector-width and required-vector-width function
1801 // attributes.
1802 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1803 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1805
1806 // Extends from v64i1 masks to 512-bit vectors.
1807 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1808 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1809 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1810
1811 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1812 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1813 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1814 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1815 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1816 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1817 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1818 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1819 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1820 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1821 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1822 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1823 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1824 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1825 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1826 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1827 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1828 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1829 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1830 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1831 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1832 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1833 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1834
1835 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1836 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1837
1838 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1839
1840 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1841 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1842 setOperationAction(ISD::VSELECT, VT, Custom);
1843 setOperationAction(ISD::ABS, VT, Legal);
1844 setOperationAction(ISD::SRL, VT, Custom);
1845 setOperationAction(ISD::SHL, VT, Custom);
1846 setOperationAction(ISD::SRA, VT, Custom);
1847 setOperationAction(ISD::MLOAD, VT, Legal);
1848 setOperationAction(ISD::MSTORE, VT, Legal);
1849 setOperationAction(ISD::CTPOP, VT, Custom);
1850 setOperationAction(ISD::CTLZ, VT, Custom);
1851 setOperationAction(ISD::SMAX, VT, Legal);
1852 setOperationAction(ISD::UMAX, VT, Legal);
1853 setOperationAction(ISD::SMIN, VT, Legal);
1854 setOperationAction(ISD::UMIN, VT, Legal);
1855 setOperationAction(ISD::SETCC, VT, Custom);
1856 setOperationAction(ISD::UADDSAT, VT, Legal);
1857 setOperationAction(ISD::SADDSAT, VT, Legal);
1858 setOperationAction(ISD::USUBSAT, VT, Legal);
1859 setOperationAction(ISD::SSUBSAT, VT, Legal);
1860 setOperationAction(ISD::SELECT, VT, Custom);
1861
1862 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1863 // setcc all the way to isel and prefer SETGT in some isel patterns.
1864 setCondCodeAction(ISD::SETLT, VT, Custom);
1865 setCondCodeAction(ISD::SETLE, VT, Custom);
1866 }
1867
1868 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1869 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1870 }
1871
1872 if (Subtarget.hasBITALG()) {
1873 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1874 setOperationAction(ISD::CTPOP, VT, Legal);
1875 }
1876
1877 if (Subtarget.hasVBMI2()) {
1878 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1879 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1880 }
1881 }
1882
1883 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1884 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1885 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1886 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1887 }
1888
1889 // These operations are handled on non-VLX by artificially widening in
1890 // isel patterns.
1891 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1892
1893 if (Subtarget.hasBITALG()) {
1894 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1895 setOperationAction(ISD::CTPOP, VT, Legal);
1896 }
1897 }
1898
1899 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1900 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1901 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1902 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1903 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1904 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1905
1906 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1907 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1908 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1909 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1910 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1911
1912 if (Subtarget.hasDQI()) {
1913 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1914 // v2f32 UINT_TO_FP is already custom under SSE2.
1915 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1916        isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1917        "Unexpected operation action!");
1918 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1919 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1920 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1921 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1922 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1923 }
1924
1925 if (Subtarget.hasBWI()) {
1926 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1927 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1928 }
1929
1930 if (Subtarget.hasVBMI2()) {
1931 // TODO: Make these legal even without VLX?
1932 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1933 MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1934 setOperationAction(ISD::FSHL, VT, Custom);
1935 setOperationAction(ISD::FSHR, VT, Custom);
1936 }
1937 }
1938
1939 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
1940 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
1941 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1942 }
1943
1944 // We want to custom lower some of our intrinsics.
1945 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1946 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1947 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1948 if (!Subtarget.is64Bit()) {
1949 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1950 }
1951
1952 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1953 // handle type legalization for these operations here.
1954 //
1955 // FIXME: We really should do custom legalization for addition and
1956 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1957 // than generic legalization for 64-bit multiplication-with-overflow, though.
1958 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1959 if (VT == MVT::i64 && !Subtarget.is64Bit())
1960 continue;
1961 // Add/Sub/Mul with overflow operations are custom lowered.
1962 setOperationAction(ISD::SADDO, VT, Custom);
1963 setOperationAction(ISD::UADDO, VT, Custom);
1964 setOperationAction(ISD::SSUBO, VT, Custom);
1965 setOperationAction(ISD::USUBO, VT, Custom);
1966 setOperationAction(ISD::SMULO, VT, Custom);
1967 setOperationAction(ISD::UMULO, VT, Custom);
1968
1969 // Support carry in as value rather than glue.
1970 setOperationAction(ISD::ADDCARRY, VT, Custom);
1971 setOperationAction(ISD::SUBCARRY, VT, Custom);
1972 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1973 }
1974
1975 if (!Subtarget.is64Bit()) {
1976 // These libcalls are not available in 32-bit.
1977 setLibcallName(RTLIB::SHL_I128, nullptr);
1978 setLibcallName(RTLIB::SRL_I128, nullptr);
1979 setLibcallName(RTLIB::SRA_I128, nullptr);
1980 setLibcallName(RTLIB::MUL_I128, nullptr);
1981 }
1982
1983 // Combine sin / cos into _sincos_stret if it is available.
1984 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1985 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1986 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1987 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1988 }
1989
1990 if (Subtarget.isTargetWin64()) {
1991 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1992 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1993 setOperationAction(ISD::SREM, MVT::i128, Custom);
1994 setOperationAction(ISD::UREM, MVT::i128, Custom);
1995 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1996 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1997 }
1998
1999 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2000 // is. We should promote the value to 64 bits to solve this.
2001 // This is what the CRT headers do - `fmodf` is an inline header
2002 // function casting to f64 and calling `fmod` (see the sketch after this constructor).
2003 if (Subtarget.is32Bit() &&
2004 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2005 for (ISD::NodeType Op :
2006 {ISD::FCEIL, ISD::STRICT_FCEIL,
2007 ISD::FCOS, ISD::STRICT_FCOS,
2008 ISD::FEXP, ISD::STRICT_FEXP,
2009 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2010 ISD::FREM, ISD::STRICT_FREM,
2011 ISD::FLOG, ISD::STRICT_FLOG,
2012 ISD::FLOG10, ISD::STRICT_FLOG10,
2013 ISD::FPOW, ISD::STRICT_FPOW,
2014 ISD::FSIN, ISD::STRICT_FSIN})
2015 if (isOperationExpand(Op, MVT::f32))
2016 setOperationAction(Op, MVT::f32, Promote);
2017
2018 // We have target-specific dag combine patterns for the following nodes:
2019 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
2020 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
2021 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
2022 setTargetDAGCombine(ISD::CONCAT_VECTORS);
2023 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
2024 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
2025 setTargetDAGCombine(ISD::BITCAST);
2026 setTargetDAGCombine(ISD::VSELECT);
2027 setTargetDAGCombine(ISD::SELECT);
2028 setTargetDAGCombine(ISD::SHL);
2029 setTargetDAGCombine(ISD::SRA);
2030 setTargetDAGCombine(ISD::SRL);
2031 setTargetDAGCombine(ISD::OR);
2032 setTargetDAGCombine(ISD::AND);
2033 setTargetDAGCombine(ISD::ADD);
2034 setTargetDAGCombine(ISD::FADD);
2035 setTargetDAGCombine(ISD::FSUB);
2036 setTargetDAGCombine(ISD::FNEG);
2037 setTargetDAGCombine(ISD::FMA);
2038 setTargetDAGCombine(ISD::STRICT_FMA);
2039 setTargetDAGCombine(ISD::FMINNUM);
2040 setTargetDAGCombine(ISD::FMAXNUM);
2041 setTargetDAGCombine(ISD::SUB);
2042 setTargetDAGCombine(ISD::LOAD);
2043 setTargetDAGCombine(ISD::MLOAD);
2044 setTargetDAGCombine(ISD::STORE);
2045 setTargetDAGCombine(ISD::MSTORE);
2046 setTargetDAGCombine(ISD::TRUNCATE);
2047 setTargetDAGCombine(ISD::ZERO_EXTEND);
2048 setTargetDAGCombine(ISD::ANY_EXTEND);
2049 setTargetDAGCombine(ISD::SIGN_EXTEND);
2050 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
2051 setTargetDAGCombine(ISD::ANY_EXTEND_VECTOR_INREG);
2052 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
2053 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
2054 setTargetDAGCombine(ISD::SINT_TO_FP);
2055 setTargetDAGCombine(ISD::UINT_TO_FP);
2056 setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
2057 setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
2058 setTargetDAGCombine(ISD::SETCC);
2059 setTargetDAGCombine(ISD::MUL);
2060 setTargetDAGCombine(ISD::XOR);
2061 setTargetDAGCombine(ISD::MSCATTER);
2062 setTargetDAGCombine(ISD::MGATHER);
2063 setTargetDAGCombine(ISD::FP16_TO_FP);
2064 setTargetDAGCombine(ISD::FP_EXTEND);
2065 setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
2066 setTargetDAGCombine(ISD::FP_ROUND);
2067
2068 computeRegisterProperties(Subtarget.getRegisterInfo());
2069
2070 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2071 MaxStoresPerMemsetOptSize = 8;
2072 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2073 MaxStoresPerMemcpyOptSize = 4;
2074 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2075 MaxStoresPerMemmoveOptSize = 4;
2076
2077 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2078 // that needs to be benchmarked and balanced with the potential use of vector
2079 // load/store types (PR33329, PR33914).
2080 MaxLoadsPerMemcmp = 2;
2081 MaxLoadsPerMemcmpOptSize = 2;
2082
2083 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2084 setPrefLoopAlignment(Align(1ULL << ExperimentalPrefLoopAlignment));
2085
2086 // An out-of-order CPU can speculatively execute past a predictable branch,
2087 // but a conditional move could be stalled by an expensive earlier operation.
2088 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2089 EnableExtLdPromotion = true;
2090 setPrefFunctionAlignment(Align(16));
2091
2092 verifyIntrinsicTables();
2093
2094 // Default to having -disable-strictnode-mutation on
2095 IsStrictFPEnabled = true;
2096}
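// A minimal sketch of the CRT-header pattern referenced in the 32-bit MSVC
// comment inside the constructor above (illustrative name and code, not the
// actual MSVC header): the f32 call is evaluated through the always-available
// f64 routine and truncated back, which is the effect of the Promote action.
#include <cmath>

static inline float fmodf_via_f64(float X, float Y) {
  // Widen both operands to double, call fmod(f64), narrow the result to float.
  return static_cast<float>(std::fmod(static_cast<double>(X),
                                      static_cast<double>(Y)));
}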
2097
2098// This has so far only been implemented for 64-bit MachO.
2099bool X86TargetLowering::useLoadStackGuardNode() const {
2100 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2101}
2102
2103bool X86TargetLowering::useStackGuardXorFP() const {
2104 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2105 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2106}
2107
2108SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2109 const SDLoc &DL) const {
2110 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2111 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2112 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2113 return SDValue(Node, 0);
2114}
2115
2116TargetLoweringBase::LegalizeTypeAction
2117X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2118 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
2119 return TypeSplitVector;
2120
2121 if (VT.getVectorNumElements() != 1 &&
2122 VT.getVectorElementType() != MVT::i1)
2123 return TypeWidenVector;
2124
2125 return TargetLoweringBase::getPreferredVectorAction(VT);
2126}
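// A few example outcomes of the preference above (hedged; the final action is
// still up to the generic legalizer):
//   - v32i1 with AVX512F but no BWI   -> TypeSplitVector (two v16i1 halves)
//   - v3i32                           -> TypeWidenVector (typically to v4i32)
//   - any other vXi1 (e.g. v2i1)      -> the base-class default action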
2127
2128static std::pair<MVT, unsigned>
2129handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2130 const X86Subtarget &Subtarget) {
2131 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2132 // convention is one that uses k registers.
2133 if (NumElts == 2)
2134 return {MVT::v2i64, 1};
2135 if (NumElts == 4)
2136 return {MVT::v4i32, 1};
2137 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2138 CC != CallingConv::Intel_OCL_BI)
2139 return {MVT::v8i16, 1};
2140 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2141 CC != CallingConv::Intel_OCL_BI)
2142 return {MVT::v16i8, 1};
2143 // v32i1 passes in ymm unless we have BWI and the calling convention is
2144 // regcall.
2145 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2146 return {MVT::v32i8, 1};
2147 // Split v64i1 vectors if we don't have v64i8 available.
2148 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2149 if (Subtarget.useAVX512Regs())
2150 return {MVT::v64i8, 1};
2151 return {MVT::v32i8, 2};
2152 }
2153
2154 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2155 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2156 NumElts > 64)
2157 return {MVT::i8, NumElts};
2158
2159 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2160}
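// A few pairs implied by the rules above (assuming the plain C calling
// convention; ST is the subtarget):
//   handleMaskRegisterForCallingConv(8,  CallingConv::C, ST) -> {MVT::v8i16, 1}
//   handleMaskRegisterForCallingConv(32, CallingConv::C, ST) -> {MVT::v32i8, 1}
//   handleMaskRegisterForCallingConv(3,  CallingConv::C, ST) -> {MVT::i8,    3}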
2161
2162MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2163 CallingConv::ID CC,
2164 EVT VT) const {
2165 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2166 Subtarget.hasAVX512()) {
2167 unsigned NumElts = VT.getVectorNumElements();
2168
2169 MVT RegisterVT;
2170 unsigned NumRegisters;
2171 std::tie(RegisterVT, NumRegisters) =
2172 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2173 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2174 return RegisterVT;
2175 }
2176
2177 // FIXME: Should we just make these types legal and custom split operations?
2178 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
2179 Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
2180 return MVT::v16i32;
2181
2182 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2183}
2184
2185unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2186 CallingConv::ID CC,
2187 EVT VT) const {
2188 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2189 Subtarget.hasAVX512()) {
2190 unsigned NumElts = VT.getVectorNumElements();
2191
2192 MVT RegisterVT;
2193 unsigned NumRegisters;
2194 std::tie(RegisterVT, NumRegisters) =
2195 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2196 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2197 return NumRegisters;
2198 }
2199
2200 // FIXME: Should we just make these types legal and custom split operations?
2201 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
2202 Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
2203 return 1;
2204
2205 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2206}
2207
2208unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2209 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2210 unsigned &NumIntermediates, MVT &RegisterVT) const {
2211 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2212 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2213 Subtarget.hasAVX512() &&
2214 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2215 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2216 VT.getVectorNumElements() > 64)) {
2217 RegisterVT = MVT::i8;
2218 IntermediateVT = MVT::i1;
2219 NumIntermediates = VT.getVectorNumElements();
2220 return NumIntermediates;
2221 }
2222
2223 // Split v64i1 vectors if we don't have v64i8 available.
2224 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2225 CC != CallingConv::X86_RegCall) {
2226 RegisterVT = MVT::v32i8;
2227 IntermediateVT = MVT::v32i1;
2228 NumIntermediates = 2;
2229 return 2;
2230 }
2231
2232 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2233 NumIntermediates, RegisterVT);
2234}
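// Example (hedged): a v64i1 argument with BWI but with 512-bit registers
// disabled, under a calling convention other than X86_RegCall, is reported
// here as two v32i1 intermediates carried in two v32i8 registers, matching
// the {MVT::v32i8, 2} pair computed by handleMaskRegisterForCallingConv.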
2235
2236EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2237 LLVMContext& Context,
2238 EVT VT) const {
2239 if (!VT.isVector())
2240 return MVT::i8;
2241
2242 if (Subtarget.hasAVX512()) {
2243 const unsigned NumElts = VT.getVectorNumElements();
2244
2245 // Figure out what this type will be legalized to.
2246 EVT LegalVT = VT;
2247 while (getTypeAction(Context, LegalVT) != TypeLegal)
2248 LegalVT = getTypeToTransformTo(Context, LegalVT);
2249
2250 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2251 if (LegalVT.getSimpleVT().is512BitVector())
2252 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2253
2254 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2255 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2256 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2257 // vXi16/vXi8.
2258 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2259 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2260 return EVT::getVectorVT(Context, MVT::i1, NumElts);
2261 }
2262 }
2263
2264 return VT.changeVectorElementTypeToInteger();
2265}
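// Example results of the logic above (hedged; they assume the types legalize
// as stated):
//   - v16f32 with AVX512 and 512-bit registers -> v16i1 mask
//   - v8i32 with AVX512VL                      -> v8i1 mask
//   - v16i8 with only AVX2                     -> v16i8 (integer-element default)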
2266
2267/// Helper for getByValTypeAlignment to determine
2268/// the desired ByVal argument alignment.
2269static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
2270 if (MaxAlign == 16)
2271 return;
2272 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2273 if (VTy->getBitWidth() == 128)
2274 MaxAlign = 16;
2275 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2276 unsigned EltAlign = 0;
2277 getMaxByValAlign(ATy->getElementType(), EltAlign);
2278 if (EltAlign > MaxAlign)
2279 MaxAlign = EltAlign;
2280 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2281 for (auto *EltTy : STy->elements()) {
2282 unsigned EltAlign = 0;
2283 getMaxByValAlign(EltTy, EltAlign);
2284 if (EltAlign > MaxAlign)
2285 MaxAlign = EltAlign;
2286 if (MaxAlign == 16)
2287 break;
2288 }
2289 }
2290}
2291
2292/// Return the desired alignment for ByVal aggregate
2293/// function arguments in the caller parameter area. For X86, aggregates
2294/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2295/// are at 4-byte boundaries.
2296unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
2297 const DataLayout &DL) const {
2298 if (Subtarget.is64Bit()) {
2299 // Max of 8 and alignment of type.
2300 unsigned TyAlign = DL.getABITypeAlignment(Ty);
2301 if (TyAlign > 8)
2302 return TyAlign;
2303 return 8;
2304 }
2305
2306 unsigned Align = 4;
2307 if (Subtarget.hasSSE1())
2308 getMaxByValAlign(Ty, Align);
2309 return Align;
2310}
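// Illustrative results of the rule above (hypothetical aggregates):
//   - 32-bit target with SSE1, struct { int I; __m128 V; } -> 16 (it contains
//     a 128-bit vector)
//   - 32-bit target, struct { int A; int B; }              -> 4
//   - 64-bit target, any aggregate                         -> max(8, ABI type
//     alignment)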
2311
2312/// It returns EVT::Other if the type should be determined using generic
2313/// target-independent logic.
2314/// For vector ops we check that the overall size isn't larger than our
2315/// preferred vector width.
2316EVT X86TargetLowering::getOptimalMemOpType(
2317 const MemOp &Op, const AttributeList &FuncAttributes) const {
2318 if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2319 if (Op.size() >= 16 &&
2320 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2321 // FIXME: Check if unaligned 64-byte accesses are slow.
2322 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2323 (Subtarget.getPreferVectorWidth() >= 512)) {
2324 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2325 }
2326 // FIXME: Check if unaligned 32-byte accesses are slow.
2327 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2328 (Subtarget.getPreferVectorWidth() >= 256)) {
2329 // Although this isn't a well-supported type for AVX1, we'll let
2330 // legalization and shuffle lowering produce the optimal codegen. If we
2331 // choose an optimal type with a vector element larger than a byte,
2332 // getMemsetStores() may create an intermediate splat (using an integer
2333 // multiply) before we splat as a vector.
2334 return MVT::v32i8;
2335 }
2336 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2337 return MVT::v16i8;
2338 // TODO: Can SSE1 handle a byte vector?
2339 // If we have SSE1 registers we should be able to use them.
2340 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2341 (Subtarget.getPreferVectorWidth() >= 128))
2342 return MVT::v4f32;
2343 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2344 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2345 // Do not use f64 to lower memcpy if source is string constant. It's
2346 // better to use i32 to avoid the loads.
2347 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2348 // The gymnastics of splatting a byte value into an XMM register and then
2349 // only using 8-byte stores (because this is a CPU with slow unaligned
2350 // 16-byte accesses) makes that a loser.
2351 return MVT::f64;
2352 }
2353 }
2354 // This is a compromise. If we reach here, unaligned accesses may be slow on
2355 // this target. However, creating smaller, aligned accesses could be even
2356 // slower and would certainly be a lot more code.
2357 if (Subtarget.is64Bit() && Op.size() >= 8)
2358 return MVT::i64;
2359 return MVT::i32;
2360}
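// Example outcomes of the selection above (hedged; they assume the function is
// not marked NoImplicitFloat, 16-byte unaligned access is not slow, and the
// prefer-vector-width attribute permits the width shown):
//   - 64-byte memcpy, AVX512, prefer-vector-width >= 512 -> v64i8 (v16i32
//     without BWI)
//   - 32-byte memcpy, AVX, prefer-vector-width >= 256    -> v32i8
//   - 16-byte memcpy with SSE2                           -> v16i8
//   - 8-byte zero memset on a 32-bit target with SSE2    -> f64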
2361
2362bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2363 if (VT == MVT::f32)
2364 return X86ScalarSSEf32;
2365 else if (VT == MVT::f64)
2366 return X86ScalarSSEf64;
2367 return true;
2368}
2369
2370bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2371 EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
2372 bool *Fast) const {
2373 if (Fast) {
2374 switch (VT.getSizeInBits()) {
2375 default:
2376 // 8-byte and under are always assumed to be fast.
2377 *Fast = true;
2378 break;
2379 case 128:
2380 *Fast = !Subtarget.isUnalignedMem16Slow();
2381 break;
2382 case 256:
2383 *Fast = !Subtarget.isUnalignedMem32Slow();
2384 break;
2385 // TODO: What about AVX-512 (512-bit) accesses?
2386 }
2387 }
2388 // NonTemporal vector memory ops must be aligned.
2389 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2390 // NT loads can only be vector aligned, so if it's less aligned than the
2391 // minimum vector size (which we can split the vector down to), we might as
2392 // well use a regular unaligned vector load.
2393 // We don't have any NT loads pre-SSE41.
2394 if (!!(Flags & MachineMemOperand::MOLoad))
2395 return (Align < 16 || !Subtarget.hasSSE41());
2396 return false;
2397 }
2398 // Misaligned accesses of any size are always allowed.
2399 return true;
2400}
2401
2402/// Return the entry encoding for a jump table in the
2403/// current function. The returned value is a member of the
2404/// MachineJumpTableInfo::JTEntryKind enum.
2405unsigned X86TargetLowering::getJumpTableEncoding() const {
2406 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2407 // symbol.
2408 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2409 return MachineJumpTableInfo::EK_Custom32;
2410
2411 // Otherwise, use the normal jump table encoding heuristics.
2412 return TargetLowering::getJumpTableEncoding();
2413}
2414
2415bool X86TargetLowering::useSoftFloat() const {
2416 return Subtarget.useSoftFloat();
2417}
2418
2419void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2420 ArgListTy &Args) const {
2421
2422 // Only relabel X86-32 for C / Stdcall CCs.
2423 if (Subtarget.is64Bit())
2424 return;
2425 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2426 return;
2427 unsigned ParamRegs = 0;
2428 if (auto *M = MF->getFunction().getParent())
2429 ParamRegs = M->getNumberRegisterParameters();
2430
2431 // Mark the first N int arguments as having reg
2432 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2433 Type *T = Args[Idx].Ty;
2434 if (T->isIntOrPtrTy())
2435 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2436 unsigned numRegs = 1;
2437 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2438 numRegs = 2;
2439 if (ParamRegs < numRegs)
2440 return;
2441 ParamRegs -= numRegs;
2442 Args[Idx].IsInReg = true;
2443 }
2444 }
2445}
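// Example (hedged): with three register parameters available (e.g. -mregparm=3
// recorded on the module) and an (i32, i64, i32) argument list, the i32 (one
// register) and the i64 (two registers) are marked InReg and the walk stops,
// since no register budget remains for the trailing i32.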
2446
2447const MCExpr *
2448X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2449 const MachineBasicBlock *MBB,
2450 unsigned uid,MCContext &Ctx) const{
2451 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2452 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2453 // entries.
2454 return MCSymbolRefExpr::create(MBB->getSymbol(),
2455 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2456}
2457
2458/// Returns relocation base for the given PIC jumptable.
2459SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2460 SelectionDAG &DAG) const {
2461 if (!Subtarget.is64Bit())
2462 // This doesn't have SDLoc associated with it, but is not really the
2463 // same as a Register.
2464 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2465 getPointerTy(DAG.getDataLayout()));
2466 return Table;
2467}
2468
2469/// This returns the relocation base for the given PIC jumptable,
2470/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2471const MCExpr *X86TargetLowering::
2472getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2473 MCContext &Ctx) const {
2474 // X86-64 uses RIP relative addressing based on the jump table label.
2475 if (Subtarget.isPICStyleRIPRel())
2476 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2477
2478 // Otherwise, the reference is relative to the PIC base.
2479 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2480}
2481
2482std::pair<const TargetRegisterClass *, uint8_t>
2483X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2484 MVT VT) const {
2485 const TargetRegisterClass *RRC = nullptr;
2486 uint8_t Cost = 1;
2487 switch (VT.SimpleTy) {
2488 default:
2489 return TargetLowering::findRepresentativeClass(TRI, VT);
2490 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2491 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2492 break;
2493 case MVT::x86mmx:
2494 RRC = &X86::VR64RegClass;
2495 break;
2496 case MVT::f32: case MVT::f64:
2497 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2498 case MVT::v4f32: case MVT::v2f64:
2499 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2500 case MVT::v8f32: case MVT::v4f64:
2501 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2502 case MVT::v16f32: case MVT::v8f64:
2503 RRC = &X86::VR128XRegClass;
2504 break;
2505 }
2506 return std::make_pair(RRC, Cost);
2507}
2508
2509unsigned X86TargetLowering::getAddressSpace() const {
2510 if (Subtarget.is64Bit())
2511 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2512 return 256;
2513}
2514
2515static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2516 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2517 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2518}
2519
2520static Constant* SegmentOffset(IRBuilder<> &IRB,
2521 unsigned Offset, unsigned AddressSpace) {
2522 return ConstantExpr::getIntToPtr(
2523 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2524 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2525}
2526
2527Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2528 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2529 // tcbhead_t; use it instead of the usual global variable (see
2530 // sysdeps/{i386,x86_64}/nptl/tls.h)
2531 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2532 if (Subtarget.isTargetFuchsia()) {
2533 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2534 return SegmentOffset(IRB, 0x10, getAddressSpace());
2535 } else {
2536 // %fs:0x28, unless we're using a Kernel code model, in which case
2537 // it's %gs:0x28. %gs:0x14 on i386.
2538 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2539 return SegmentOffset(IRB, Offset, getAddressSpace());
2540 }
2541 }
2542
2543 return TargetLowering::getIRStackGuard(IRB);
2544}
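// Illustrative sketch (not part of X86ISelLowering.cpp): the fixed TLS offsets
// chosen above, written out as a small table. The segment-register mapping
// (%fs for x86-64, %gs for i386 and for the Kernel code model) is an
// assumption spelled out here only for illustration.
#include <cstdio>

struct GuardSlot { const char *Target; unsigned Offset; };

int main() {
  const GuardSlot Slots[] = {
      {"Fuchsia (ZX_TLS_STACK_GUARD_OFFSET)", 0x10},
      {"glibc/bionic x86-64", 0x28},
      {"glibc/bionic i386", 0x14},
  };
  for (const GuardSlot &S : Slots)
    std::printf("%-38s segment offset 0x%x\n", S.Target, S.Offset);
  return 0;
}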
2545
2546void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2547 // The MSVC CRT provides functionality for stack protection.
2548 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2549 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2550 // MSVC CRT has a global variable holding security cookie.
2551 M.getOrInsertGlobal("__security_cookie",
2552 Type::getInt8PtrTy(M.getContext()));
2553
2554 // MSVC CRT has a function to validate security cookie.
2555 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2556 "__security_check_cookie", Type::getVoidTy(M.getContext()),
2557 Type::getInt8PtrTy(M.getContext()));
2558 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2559 F->setCallingConv(CallingConv::X86_FastCall);
2560 F->addAttribute(1, Attribute::AttrKind::InReg);
2561 }
2562 return;
2563 }
2564 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2565 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2566 return;
2567 TargetLowering::insertSSPDeclarations(M);
2568}
2569
2570Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2571 // MSVC CRT has a global variable holding security cookie.
2572 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2573 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2574 return M.getGlobalVariable("__security_cookie");
2575 }
2576 return TargetLowering::getSDagStackGuard(M);
2577}
2578
2579Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2580 // MSVC CRT has a function to validate security cookie.
2581 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2582 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2583 return M.getFunction("__security_check_cookie");
2584 }
2585 return TargetLowering::getSSPStackGuardCheck(M);
2586}
2587
2588Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2589 if (Subtarget.getTargetTriple().isOSContiki())
2590 return getDefaultSafeStackPointerLocation(IRB, false);
2591
2592 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2593 // definition of TLS_SLOT_SAFESTACK in
2594 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2595 if (Subtarget.isTargetAndroid()) {
2596 // %fs:0x48, unless we're using a Kernel code model, in which case
2597 // it's %gs:0x48. %gs:0x24 on i386.
2598 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2599 return SegmentOffset(IRB, Offset, getAddressSpace());
2600 }
2601
2602 // Fuchsia is similar.
2603 if (Subtarget.isTargetFuchsia()) {
2604 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2605 return SegmentOffset(IRB, 0x18, getAddressSpace());
2606 }
2607
2608 return TargetLowering::getSafeStackPointerLocation(IRB);
2609}
2610
2611bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2612 unsigned DestAS) const {
2613 assert(SrcAS != DestAS && "Expected different address spaces!");
2614
2615 const TargetMachine &TM = getTargetMachine();
2616 if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
2617 return false;
2618
2619 return SrcAS < 256 && DestAS < 256;
2620}
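// Illustrative sketch (not part of X86ISelLowering.cpp): a standalone model of
// the predicate above. Address spaces 256 and up are the segment-based spaces
// used by getAddressSpace(), so a cast is only a no-op when both sides are
// ordinary (< 256) address spaces with the same pointer width.
#include <cstdio>

static bool isNoopCast(unsigned SrcAS, unsigned DestAS,
                       unsigned SrcPtrBytes, unsigned DestPtrBytes) {
  if (SrcPtrBytes != DestPtrBytes)
    return false;
  return SrcAS < 256 && DestAS < 256;
}

int main() {
  std::printf("%d\n", isNoopCast(0, 1, 8, 8));   // 1: plain address spaces
  std::printf("%d\n", isNoopCast(0, 257, 8, 8)); // 0: segment-based space
  return 0;
}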
2621
2622//===----------------------------------------------------------------------===//
2623// Return Value Calling Convention Implementation
2624//===----------------------------------------------------------------------===//
2625
2626bool X86TargetLowering::CanLowerReturn(
2627 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2628 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2629 SmallVector<CCValAssign, 16> RVLocs;
2630 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2631 return CCInfo.CheckReturn(Outs, RetCC_X86);
2632}
2633
2634const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2635 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2636 return ScratchRegs;
2637}
2638
2639/// Lowers mask values (v*i1) to the local register values
2640/// \returns DAG node after lowering to register type
2641static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2642 const SDLoc &Dl, SelectionDAG &DAG) {
2643 EVT ValVT = ValArg.getValueType();
2644
2645 if (ValVT == MVT::v1i1)
2646 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2647 DAG.getIntPtrConstant(0, Dl));
2648
2649 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2650 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2651 // Two stage lowering might be required
2652 // bitcast: v8i1 -> i8 / v16i1 -> i16
2653 // anyextend: i8 -> i32 / i16 -> i32
2654 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2655 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2656 if (ValLoc == MVT::i32)
2657 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2658 return ValToCopy;
2659 }
2660
2661 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2662 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2663 // One stage lowering is required
2664 // bitcast: v32i1 -> i32 / v64i1 -> i64
2665 return DAG.getBitcast(ValLoc, ValArg);
2666 }
2667
2668 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2669}
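// Illustrative sketch (not part of X86ISelLowering.cpp): the two-stage
// lowering above, modelled on plain integers. A v8i1 mask first becomes its
// i8 bit pattern (the "bitcast" stage) and is then any-extended to the i32
// location type; the upper bits are unspecified, here simply zero.
#include <cstdint>
#include <cstdio>

int main() {
  bool Mask[8] = {1, 0, 1, 1, 0, 0, 1, 0}; // hypothetical v8i1 value
  uint8_t Bits = 0;
  for (int I = 0; I < 8; ++I)              // bitcast stage: v8i1 -> i8
    Bits |= static_cast<uint8_t>(Mask[I]) << I;
  uint32_t Loc = Bits;                     // anyextend stage: i8 -> i32
  std::printf("i8 = 0x%02x, i32 location = 0x%08x\n",
              static_cast<unsigned>(Bits), static_cast<unsigned>(Loc));
  return 0;
}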
2670
2671/// Breaks v64i1 value into two registers and adds the new node to the DAG
2672static void Passv64i1ArgInRegs(
2673 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2674 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
2675 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2676 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2677 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2678 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2679 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2680 "The value should reside in two registers");
2681
2682 // Before splitting the value we cast it to i64
2683 Arg = DAG.getBitcast(MVT::i64, Arg);
2684
2685 // Splitting the value into two i32 types
2686 SDValue Lo, Hi;
2687 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2688 DAG.getConstant(0, Dl, MVT::i32));
2689 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2690 DAG.getConstant(1, Dl, MVT::i32));
2691
2692 // Attach the two i32 values to the corresponding registers
2693 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2694 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2695}
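// Illustrative sketch (not part of X86ISelLowering.cpp): the split performed
// above, on a plain 64-bit integer. EXTRACT_ELEMENT index 0 corresponds to
// the low 32 bits and index 1 to the high 32 bits; each half then lands in
// its own 32-bit register.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Arg = 0x0123456789ABCDEFULL;            // bitcast of the v64i1 value
  uint32_t Lo = static_cast<uint32_t>(Arg);        // element 0 -> first reg
  uint32_t Hi = static_cast<uint32_t>(Arg >> 32);  // element 1 -> second reg
  std::printf("lo = 0x%08x, hi = 0x%08x\n",
              static_cast<unsigned>(Lo), static_cast<unsigned>(Hi));
  return 0;
}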
2696
2697SDValue
2698X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2699 bool isVarArg,
2700 const SmallVectorImpl<ISD::OutputArg> &Outs,
2701 const SmallVectorImpl<SDValue> &OutVals,
2702 const SDLoc &dl, SelectionDAG &DAG) const {
2703 MachineFunction &MF = DAG.getMachineFunction();
2704 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2705
2706 // In some cases we need to disable registers from the default CSR list.
2707 // For example, when they are used for argument passing.
2708 bool ShouldDisableCalleeSavedRegister =
2709 CallConv == CallingConv::X86_RegCall ||
2710 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2711
2712 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2713 report_fatal_error("X86 interrupts may not return any value");
2714
2715 SmallVector<CCValAssign, 16> RVLocs;
2716 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2717 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2718
2719 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
2720 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2721 ++I, ++OutsIndex) {
2722 CCValAssign &VA = RVLocs[I];
2723 assert(VA.isRegLoc() && "Can only return in registers!");
2724
2725 // Add the register to the CalleeSaveDisableRegs list.
2726 if (ShouldDisableCalleeSavedRegister)
2727 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2728
2729 SDValue ValToCopy = OutVals[OutsIndex];
2730 EVT ValVT = ValToCopy.getValueType();
2731
2732 // Promote values to the appropriate types.
2733 if (VA.getLocInfo() == CCValAssign::SExt)
2734 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2735 else if (VA.getLocInfo() == CCValAssign::ZExt)
2736 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2737 else if (VA.getLocInfo() == CCValAssign::AExt) {
2738 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2739 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2740 else
2741 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2742 }
2743 else if (VA.getLocInfo() == CCValAssign::BCvt)
2744 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2745
2746 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2747 "Unexpected FP-extend for return value.");
2749 // Report an error if we have attempted to return a value via an XMM
2750 // register and SSE was disabled.
2751 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2752 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2753 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2754 } else if (!Subtarget.hasSSE2() &&
2755 X86::FR64XRegClass.contains(VA.getLocReg()) &&
2756 ValVT == MVT::f64) {
2757 // When returning a double via an XMM register, report an error if SSE2 is
2758 // not enabled.
2759 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2760 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2761 }
2762
2763 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2764 // the RET instruction and handled by the FP Stackifier.
2765 if (VA.getLocReg() == X86::FP0 ||
2766 VA.getLocReg() == X86::FP1) {
2767 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2768 // change the value to the FP stack register class.
2769 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2770 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2771 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2772 // Don't emit a copytoreg.
2773 continue;
2774 }
2775
2776 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2777 // which is returned in RAX / RDX.
2778 if (Subtarget.is64Bit()) {
2779 if (ValVT == MVT::x86mmx) {
2780 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2781 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2782 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2783 ValToCopy);
2784 // If we don't have SSE2 available, convert to v4f32 so the generated
2785 // register is legal.
2786 if (!Subtarget.hasSSE2())
2787 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2788 }
2789 }
2790 }
2791
2792 if (VA.needsCustom()) {
2793 assert(VA.getValVT() == MVT::v64i1 &&
2794 "Currently the only custom case is when we split v64i1 to 2 regs");
2795
2796 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2797 Subtarget);
2798
2799 // Add the second register to the CalleeSaveDisableRegs list.
2800 if (ShouldDisableCalleeSavedRegister)
2801 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2802 } else {
2803 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2804 }
2805 }
2806
2807 SDValue Flag;
2808 SmallVector<SDValue, 6> RetOps;
2809 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2810 // Operand #1 = Bytes To Pop
2811 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2812 MVT::i32));
2813
2814 // Copy the result values into the output registers.
2815 for (auto &RetVal : RetVals) {
2816 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2817 RetOps.push_back(RetVal.second);
2818 continue; // Don't emit a copytoreg.
2819 }
2820
2821 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2822 Flag = Chain.getValue(1);
2823 RetOps.push_back(
2824 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2825 }
2826
2827 // Swift calling convention does not require we copy the sret argument
2828 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2829
2830 // All x86 ABIs require that for returning structs by value we copy
2831 // the sret argument into %rax/%eax (depending on ABI) for the return.
2832 // We saved the argument into a virtual register in the entry block,
2833 // so now we copy the value out and into %rax/%eax.
2834 //
2835 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2836 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2837 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2838 // either case FuncInfo->setSRetReturnReg() will have been called.
2839 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2840 // When we have both sret and another return value, we should use the
2841 // original Chain stored in RetOps[0], instead of the current Chain updated
2842 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2843
2844 // For the case of sret and another return value, we have
2845 // Chain_0 at the function entry
2846 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2847 // If we use Chain_1 in getCopyFromReg, we will have
2848 // Val = getCopyFromReg(Chain_1)
2849 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2850
2851 // getCopyToReg(Chain_0) will be glued together with
2852 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2853 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2854 // Data dependency from Unit B to Unit A due to usage of Val in
2855 // getCopyToReg(Chain_1, Val)
2856 // Chain dependency from Unit A to Unit B
2857
2858 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2859 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2860 getPointerTy(MF.getDataLayout()));
2861
2862 unsigned RetValReg
2863 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2864 X86::RAX : X86::EAX;
2865 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2866 Flag = Chain.getValue(1);
2867
2868 // RAX/EAX now acts like a return value.
2869 RetOps.push_back(
2870 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2871
2872 // Add the returned register to the CalleeSaveDisableRegs list.
2873 if (ShouldDisableCalleeSavedRegister)
2874 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2875 }
2876
2877 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2878 const MCPhysReg *I =
2879 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2880 if (I) {
2881 for (; *I; ++I) {
2882 if (X86::GR64RegClass.contains(*I))
2883 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2884 else
2885 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2886 }
2887 }
2888
2889 RetOps[0] = Chain; // Update chain.
2890
2891 // Add the flag if we have it.
2892 if (Flag.getNode())
2893 RetOps.push_back(Flag);
2894
2895 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2896 if (CallConv == CallingConv::X86_INTR)
2897 opcode = X86ISD::IRET;
2898 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2899}
2900
2901bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2902 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2903 return false;
2904
2905 SDValue TCChain = Chain;
2906 SDNode *Copy = *N->use_begin();
2907 if (Copy->getOpcode() == ISD::CopyToReg) {
2908 // If the copy has a glue operand, we conservatively assume it isn't safe to
2909 // perform a tail call.
2910 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2911 return false;
2912 TCChain = Copy->getOperand(0);
2913 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2914 return false;
2915
2916 bool HasRet = false;
2917 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2918 UI != UE; ++UI) {
2919 if (UI->getOpcode() != X86ISD::RET_FLAG)
2920 return false;
2921 // If we are returning more than one value, we can definitely
2922 // not make a tail call; see PR19530.
2923 if (UI->getNumOperands() > 4)
2924 return false;
2925 if (UI->getNumOperands() == 4 &&
2926 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2927 return false;
2928 HasRet = true;
2929 }
2930
2931 if (!HasRet)
2932 return false;
2933
2934 Chain = TCChain;
2935 return true;
2936}
2937
2938EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2939 ISD::NodeType ExtendKind) const {
2940 MVT ReturnMVT = MVT::i32;
2941
2942 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2943 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2944 // The ABI does not require i1, i8 or i16 to be extended.
2945 //
2946 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2947 // always extending i8/i16 return values, so keep doing that for now.
2948 // (PR26665).
2949 ReturnMVT = MVT::i8;
2950 }
2951
2952 EVT MinVT = getRegisterType(Context, ReturnMVT);
2953 return VT.bitsLT(MinVT) ? MinVT : VT;
2954}
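// Illustrative sketch (not part of X86ISelLowering.cpp): the decision above as
// a standalone table, assuming bit widths stand in for the MVTs and that the
// register type of i8 is i8. On Darwin, i8/i16 results keep the historical
// widening to i32; elsewhere the minimum extended return type drops to i8, so
// small results are returned as-is.
#include <algorithm>
#include <cstdio>

static unsigned extReturnBits(unsigned ValBits, bool Darwin) {
  unsigned MinBits = 32;                         // default ReturnMVT = i32
  if (ValBits == 1 || (!Darwin && (ValBits == 8 || ValBits == 16)))
    MinBits = 8;                                 // ABI does not require extension
  return std::max(ValBits, MinBits);             // VT.bitsLT(MinVT) ? MinVT : VT
}

int main() {
  std::printf("i8 on Linux  -> i%u\n", extReturnBits(8, false));  // i8
  std::printf("i8 on Darwin -> i%u\n", extReturnBits(8, true));   // i32
  std::printf("i64 anywhere -> i%u\n", extReturnBits(64, true));  // i64
  return 0;
}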
2955
2956/// Reads two 32 bit registers and creates a 64 bit mask value.
2957/// \param VA The current 32 bit value that needs to be assigned.
2958/// \param NextVA The next 32 bit value that needs to be assigned.
2959/// \param Root The parent DAG node.
2960/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2961/// glue purposes. In case the DAG is already using a
2962/// physical register instead of a virtual one, we should glue
2963/// our new SDValue to the InFlag SDValue.
2964/// \return a new SDValue of size 64 bits.
2965static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2966 SDValue &Root, SelectionDAG &DAG,
2967 const SDLoc &Dl, const X86Subtarget &Subtarget,
2968 SDValue *InFlag = nullptr) {
2969 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2970 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2971 assert(VA.getValVT() == MVT::v64i1 &&
2972 "Expecting first location of 64 bit width type");
2973 assert(NextVA.getValVT() == VA.getValVT() &&
2974 "The locations should have the same type");
2975 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2976 "The values should reside in two registers");
2977
2978 SDValue Lo, Hi;
2979 SDValue ArgValueLo, ArgValueHi;
2980
2981 MachineFunction &MF = DAG.getMachineFunction();
2982 const TargetRegisterClass *RC = &X86::GR32RegClass;
2983
2984 // Read a 32 bit value from the registers.
2985 if (nullptr == InFlag) {
2986 // When no physical register is present,
2987 // create an intermediate virtual register.
2988 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2989 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2990 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2991 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2992 } else {
2993 // When a physical register is available read the value from it and glue
2994 // the reads together.
2995 ArgValueLo =
2996 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2997 *InFlag = ArgValueLo.getValue(2);
2998 ArgValueHi =
2999 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3000 *InFlag = ArgValueHi.getValue(2);
3001 }
3002
3003 // Convert the i32 type into v32i1 type.
3004 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3005
3006 // Convert the i32 type into v32i1 type.
3007 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3008
3009 // Concatenate the two values together.
3010 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3011}
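// Illustrative sketch (not part of X86ISelLowering.cpp): the inverse of the
// split shown earlier, on plain integers. Two 32-bit register reads become the
// low and high halves of one 64-bit mask, mirroring the CONCAT_VECTORS of the
// two v32i1 halves above.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Lo = 0x89ABCDEFu; // value read from the first location register
  uint32_t Hi = 0x01234567u; // value read from the next location register
  uint64_t Mask = (static_cast<uint64_t>(Hi) << 32) | Lo;
  std::printf("v64i1 bit pattern = 0x%016llx\n",
              static_cast<unsigned long long>(Mask));
  return 0;
}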
3012
3013/// The function will lower a register of various sizes (8/16/32/64)
3014/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
3015/// \returns a DAG node containing the operand after lowering to the mask type.
3016static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3017 const EVT &ValLoc, const SDLoc &Dl,
3018 SelectionDAG &DAG) {
3019 SDValue ValReturned = ValArg;
3020
3021 if (ValVT == MVT::v1i1)
3022 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3023
3024 if (ValVT == MVT::v64i1) {
3025 // On a 32 bit machine, this case is handled by getv64i1Argument.
3026 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3027 // On a 64 bit machine, there is no need to truncate the value, only bitcast it.
3028 } else {
3029 MVT maskLen;
3030 switch (ValVT.getSimpleVT().SimpleTy) {
3031 case MVT::v8i1:
3032 maskLen = MVT::i8;
3033 break;
3034 case MVT::v16i1:
3035 maskLen = MVT::i16;
3036 break;
3037 case MVT::v32i1:
3038 maskLen = MVT::i32;
3039 break;
3040 default:
3041 llvm_unreachable("Expecting a vector of i1 types");
3043
3044 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3045 }
3046 return DAG.getBitcast(ValVT, ValReturned);
3047}
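// Illustrative sketch (not part of X86ISelLowering.cpp): the truncation-width
// table used above. A mask of N x i1 is truncated from its location register
// down to an N-bit integer before the final bitcast back to the mask type.
#include <cstdio>

static unsigned maskBits(unsigned NumElts) {
  switch (NumElts) {
  case 8:  return 8;   // v8i1  -> i8
  case 16: return 16;  // v16i1 -> i16
  case 32: return 32;  // v32i1 -> i32
  default: return 0;   // handled elsewhere (v1i1, v64i1)
  }
}

int main() {
  const unsigned Widths[] = {8, 16, 32};
  for (unsigned N : Widths)
    std::printf("v%ui1 truncates to i%u\n", N, maskBits(N));
  return 0;
}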
3048
3049/// Lower the result values of a call into the
3050/// appropriate copies out of appropriate physical registers.
3051///
3052SDValue X86TargetLowering::LowerCallResult(
3053 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3054 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3055 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3056 uint32_t *RegMask) const {
3057
3058 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3059 // Assign locations to each value returned by this call.
3060 SmallVector<CCValAssign, 16> RVLocs;
3061 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3062 *DAG.getContext());
3063 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3064
3065 // Copy all of the result registers out of their specified physreg.
3066 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3067 ++I, ++InsIndex) {
3068 CCValAssign &VA = RVLocs[I];
3069 EVT CopyVT = VA.getLocVT();
3070
3071 // In some calling conventions we need to remove the used registers
3072 // from the register mask.
3073 if (RegMask) {
3074 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3075 SubRegs.isValid(); ++SubRegs)
3076 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3077 }
3078
3079 // Report an error if there was an attempt to return FP values via XMM
3080 // registers.
3081 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3082 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3083 if (VA.getLocReg() == X86::XMM1)
3084 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3085 else
3086 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3087 } else if (!Subtarget.hasSSE2() &&
3088 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3089 CopyVT == MVT::f64) {
3090 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3091 if (VA.getLocReg() == X86::XMM1)
3092 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3093 else
3094 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3095 }
3096
3097 // If we prefer to use the value in xmm registers, copy it out as f80 and
3098 // use a truncate to move it from fp stack reg to xmm reg.
3099 bool RoundAfterCopy = false;
3100 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3101 isScalarFPTypeInSSEReg(VA.getValVT())) {
3102 if (!Subtarget.hasX87())
3103 report_fatal_error("X87 register return with X87 disabled");
3104 CopyVT = MVT::f80;
3105 RoundAfterCopy = (CopyVT != VA.getLocVT());
3106 }
3107
3108 SDValue Val;
3109 if (VA.needsCustom()) {
3110 assert(VA.getValVT() == MVT::v64i1 &&
3111 "Currently the only custom case is when we split v64i1 to 2 regs");
3112 Val =
3113 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3114 } else {
3115 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3116 .getValue(1);
3117 Val = Chain.getValue(0);
3118 InFlag = Chain.getValue(2);
3119 }
3120
3121 if (RoundAfterCopy)
3122 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3123 // This truncation won't change the value.
3124 DAG.getIntPtrConstant(1, dl));
3125
3126 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
3127 if (VA.getValVT().isVector() &&
3128 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3129 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3130 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3131 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3132 } else
3133 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3134 }
3135
3136 if (VA.getLocInfo() == CCValAssign::BCvt)
3137 Val = DAG.getBitcast(VA.getValVT(), Val);
3138
3139 InVals.push_back(Val);
3140 }
3141
3142 return Chain;
3143}
3144
3145//===----------------------------------------------------------------------===//
3146// C & StdCall & Fast Calling Convention implementation
3147//===----------------------------------------------------------------------===//
3148// The StdCall calling convention seems to be standard for many Windows API
3149// routines. It differs from the C calling convention just a little: the
3150// callee should clean up the stack, not the caller. Symbols are also
3151// decorated in a particular way. It doesn't support any vector arguments.
3152// For info on fast calling convention see Fast Calling Convention (tail call)
3153// implementation LowerX86_32FastCCCallTo.
3154
3155/// CallIsStructReturn - Determines whether a call uses struct return
3156/// semantics.
3157enum StructReturnType {
3158 NotStructReturn,
3159 RegStructReturn,
3160 StackStructReturn
3161};
3162static StructReturnType
3163callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
3164 if (Outs.empty())
3165 return NotStructReturn;
3166
3167 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3168 if (!Flags.isSRet())
3169 return NotStructReturn;
3170 if (Flags.isInReg() || IsMCU)
3171 return RegStructReturn;
3172 return StackStructReturn;
3173}
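// Illustrative sketch (not part of X86ISelLowering.cpp): the classification
// above on hypothetical flag values. Only the first outgoing argument is
// inspected; an sret argument that is also inreg (or any sret on MCU targets)
// is returned through a register, otherwise through the stack.
#include <cstdio>

enum SRetKind { NotSRet, RegSRet, StackSRet };

static SRetKind classify(bool HasArgs, bool FirstIsSRet, bool FirstIsInReg,
                         bool IsMCU) {
  if (!HasArgs || !FirstIsSRet)
    return NotSRet;
  return (FirstIsInReg || IsMCU) ? RegSRet : StackSRet;
}

int main() {
  std::printf("%d %d %d\n",
              classify(true, true, false, false),   // 2: StackSRet
              classify(true, true, true, false),    // 1: RegSRet
              classify(true, false, false, false)); // 0: NotSRet
  return 0;
}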
3174
3175/// Determines whether a function uses struct return semantics.
3176static StructReturnType
3177argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
3178 if (Ins.empty())
3179 return NotStructReturn;
3180
3181 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3182 if (!Flags.isSRet())
3183 return NotStructReturn;
3184 if (Flags.isInReg() || IsMCU)
3185 return RegStructReturn;
3186 return StackStructReturn;
3187}
3188
3189/// Make a copy of an aggregate at address specified by "Src" to address
3190/// "Dst" with size and alignment information specified by the specific
3191/// parameter attribute. The copy will be passed as a byval function parameter.
3192static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3193 SDValue Chain, ISD::ArgFlagsTy Flags,
3194 SelectionDAG &DAG, const SDLoc &dl) {
3195 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
3196
3197 return DAG.getMemcpy(
3198 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3199 /*isVolatile*/ false, /*AlwaysInline=*/true,
3200 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3201}
3202
3203/// Return true if the calling convention is one that we can guarantee TCO for.
3204static bool canGuaranteeTCO(CallingConv::ID CC) {
3205 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3206 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3207 CC == CallingConv::HHVM || CC == CallingConv::Tail);
3208}
3209
3210/// Return true if we might ever do TCO for calls with this calling convention.
3211static bool mayTailCallThisCC(CallingConv::ID CC) {
3212 switch (CC) {
3213 // C calling conventions:
3214 case CallingConv::C:
3215 case CallingConv::Win64:
3216 case CallingConv::X86_64_SysV:
3217 // Callee pop conventions:
3218 case CallingConv::X86_ThisCall:
3219 case CallingConv::X86_StdCall:
3220 case CallingConv::X86_VectorCall:
3221 case CallingConv::X86_FastCall:
3222 // Swift:
3223 case CallingConv::Swift:
3224 return true;
3225 default:
3226 return canGuaranteeTCO(CC);
3227 }
3228}
3229
3230/// Return true if the function is being made into a tailcall target by
3231/// changing its ABI.
3232static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3233 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
3234}
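// Illustrative sketch (not part of X86ISelLowering.cpp): how the predicates
// above compose with -tailcallopt, using plain booleans instead of
// CallingConv IDs. A guaranteed-TCO convention is always tail-callable; the
// other conventions listed in mayTailCallThisCC are only candidates for
// opportunistic sibling-call optimization.
#include <cstdio>

static bool guaranteeTCO(bool CanGuaranteeTCO, bool IsTailCC,
                         bool GuaranteedTailCallOpt) {
  return (GuaranteedTailCallOpt && CanGuaranteeTCO) || IsTailCC;
}

int main() {
  std::printf("%d\n", guaranteeTCO(true, false, true));   // 1: fastcc + -tailcallopt
  std::printf("%d\n", guaranteeTCO(true, false, false));  // 0: fastcc alone
  std::printf("%d\n", guaranteeTCO(false, true, false));  // 1: tailcc always
  return 0;
}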
3235
3236bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3237 if (!CI->isTailCall())
3238 return false;
3239
3240 ImmutableCallSite CS(CI);
3241 CallingConv::ID CalleeCC = CS.getCallingConv();
3242 if (!mayTailCallThisCC(CalleeCC))
3243 return false;
3244
3245 return true;
3246}
3247
3248SDValue
3249X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3250 const SmallVectorImpl<ISD::InputArg> &Ins,
3251 const SDLoc &dl, SelectionDAG &DAG,
3252 const CCValAssign &VA,
3253 MachineFrameInfo &MFI, unsigned i) const {
3254 // Create the nodes corresponding to a load from this parameter slot.
3255 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3256 bool AlwaysUseMutable = shouldGuaranteeTCO(
3257 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3258 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3259 EVT ValVT;
3260 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3261
3262 // If the value is passed by pointer, we have the address passed instead of
3263 // the value itself. No need to extend if the mask value and location share
3264 // the same absolute size.
3265 bool ExtendedInMem =
3266 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3267 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3268
3269 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3270 ValVT = VA.getLocVT();
3271 else
3272 ValVT = VA.getValVT();
3273
3274 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3275 // changed with more analysis.
3276 // In case of tail call optimization, mark all arguments mutable, since they
3277 // could be overwritten by the lowering of arguments in case of a tail call.
3278 if (Flags.isByVal()) {
3279 unsigned Bytes = Flags.getByValSize();
3280 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3281
3282 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3283 // can be improved with deeper analysis.
3284 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3285 /*isAliased=*/true);
3286 return DAG.getFrameIndex(FI, PtrVT);
3287 }
3288
3289 // This is an argument in memory. We might be able to perform copy elision.
3290 // If the argument is passed directly in memory without any extension, then we
3291 // can perform copy elision. Large vector types, for example, may be passed
3292 // indirectly by pointer.
3293 if (Flags.isCopyElisionCandidate() &&
3294 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem) {
3295 EVT ArgVT = Ins[i].ArgVT;
3296 SDValue PartAddr;
3297 if (Ins[i].PartOffset == 0) {
3298 // If this is a one-part value or the first part of a multi-part value,
3299 // create a stack object for the entire argument value type and return a
3300 // load from our portion of it. This assumes that if the first part of an
3301 // argument is in memory, the rest will also be in memory.
3302 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3303 /*IsImmutable=*/false);
3304 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3305 return DAG.getLoad(
3306 ValVT, dl, Chain, PartAddr,
3307 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3308 } else {
3309 // This is not the first piece of an argument in memory. See if there is
3310 // already a fixed stack object including this offset. If so, assume it
3311 // was created by the PartOffset == 0 branch above and create a load from
3312 // the appropriate offset into it.
3313 int64_t PartBegin = VA.getLocMemOffset();
3314 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3315 int FI = MFI.getObjectIndexBegin();
3316 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3317 int64_t ObjBegin = MFI.getObjectOffset(FI);
3318 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3319 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3320 break;
3321 }
3322 if (MFI.isFixedObjectIndex(FI)) {
3323 SDValue Addr =
3324 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3325 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3326 return DAG.getLoad(
3327 ValVT, dl, Chain, Addr,
3328 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3329 Ins[i].PartOffset));
3330 }
3331 }
3332 }
3333
3334 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3335 VA.getLocMemOffset(), isImmutable);
3336
3337 // Set SExt or ZExt flag.
3338 if (VA.getLocInfo() == CCValAssign::ZExt) {
3339 MFI.setObjectZExt(FI, true);
3340 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3341 MFI.setObjectSExt(FI, true);
3342 }
3343
3344 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3345 SDValue Val = DAG.getLoad(
3346 ValVT, dl, Chain, FIN,
3347 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3348 return ExtendedInMem
3349 ? (VA.getValVT().isVector()
3350 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3351 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3352 : Val;
3353}
3354
3355// FIXME: Get this from tablegen.
3356static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3357 const X86Subtarget &Subtarget) {
3358 assert(Subtarget.is64Bit());
3359
3360 if (Subtarget.isCallingConvWin64(CallConv)) {
3361 static const MCPhysReg GPR64ArgRegsWin64[] = {
3362 X86::RCX, X86::RDX, X86::R8, X86::R9
3363 };
3364 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3365 }
3366
3367 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3368 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3369 };
3370 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3371}
3372
3373// FIXME: Get this from tablegen.
3374static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3375 CallingConv::ID CallConv,
3376 const X86Subtarget &Subtarget) {
3377 assert(Subtarget.is64Bit());
3378 if (Subtarget.isCallingConvWin64(CallConv)) {
3379 // The XMM registers which might contain var arg parameters are shadowed
3380 // in their paired GPR. So we only need to save the GPRs to their home
3381 // slots.
3382 // TODO: __vectorcall will change this.
3383 return None;
3384 }
3385
3386 const Function &F = MF.getFunction();
3387 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
3388 bool isSoftFloat = Subtarget.useSoftFloat();
3389 assert(!(isSoftFloat && NoImplicitFloatOps) &&
3390 "SSE register cannot be used when SSE is disabled!");
3391 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
3392 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3393 // registers.
3394 return None;
3395
3396 static const MCPhysReg XMMArgRegs64Bit[] = {
3397 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3398 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3399 };
3400 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3401}
3402
3403#ifndef NDEBUG
3404static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3405 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
3406 [](const CCValAssign &A, const CCValAssign &B) -> bool {
3407 return A.getValNo() < B.getValNo();
3408 });
3409}
3410#endif
3411
3412SDValue X86TargetLowering::LowerFormalArguments(
3413 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3414 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3415 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3416 MachineFunction &MF = DAG.getMachineFunction();
3417 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3418 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3419
3420 const Function &F = MF.getFunction();
3421 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3422 F.getName() == "main")
3423 FuncInfo->setForceFramePointer(true);
3424
3425 MachineFrameInfo &MFI = MF.getFrameInfo();
3426 bool Is64Bit = Subtarget.is64Bit();
3427 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3428
3429 assert(
3430 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3431 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3432
3433 // Assign locations to all of the incoming arguments.
3434 SmallVector<CCValAssign, 16> ArgLocs;
3435 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3436
3437 // Allocate shadow area for Win64.
3438 if (IsWin64)
3439 CCInfo.AllocateStack(32, 8);
3440
3441 CCInfo.AnalyzeArguments(Ins, CC_X86);
3442
3443 // In vectorcall calling convention a second pass is required for the HVA
3444 // types.
3445 if (CallingConv::X86_VectorCall == CallConv) {
3446 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3447 }
3448
3449 // The next loop assumes that the locations are in the same order of the
3450 // input arguments.
3451 assert(isSortedByValueNo(ArgLocs) &&
3452 "Argument Location list must be sorted before lowering");
3453
3454 SDValue ArgValue;
3455 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3456 ++I, ++InsIndex) {
3457 assert(InsIndex < Ins.size() && "Invalid Ins index");
3458 CCValAssign &VA = ArgLocs[I];
3459
3460 if (VA.isRegLoc()) {
3461 EVT RegVT = VA.getLocVT();
3462 if (VA.needsCustom()) {
3463 assert(
3464 VA.getValVT() == MVT::v64i1 &&
3465 "Currently the only custom case is when we split v64i1 to 2 regs");
3466
3467 // v64i1 values, in the regcall calling convention, that are
3468 // compiled for a 32 bit arch, are split up into two registers.
3469 ArgValue =
3470 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3471 } else {
3472 const TargetRegisterClass *RC;
3473 if (RegVT == MVT::i8)
3474 RC = &X86::GR8RegClass;
3475 else if (RegVT == MVT::i16)
3476 RC = &X86::GR16RegClass;
3477 else if (RegVT == MVT::i32)
3478 RC = &X86::GR32RegClass;
3479 else if (Is64Bit && RegVT == MVT::i64)
3480 RC = &X86::GR64RegClass;
3481 else if (RegVT == MVT::f32)
3482 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3483 else if (RegVT == MVT::f64)
3484 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3485 else if (RegVT == MVT::f80)
3486 RC = &X86::RFP80RegClass;
3487 else if (RegVT == MVT::f128)
3488 RC = &X86::VR128RegClass;
3489 else if (RegVT.is512BitVector())
3490 RC = &X86::VR512RegClass;
3491 else if (RegVT.is256BitVector())
3492 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3493 else if (RegVT.is128BitVector())
3494 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3495 else if (RegVT == MVT::x86mmx)
3496 RC = &X86::VR64RegClass;
3497 else if (RegVT == MVT::v1i1)
3498 RC = &X86::VK1RegClass;
3499 else if (RegVT == MVT::v8i1)
3500 RC = &X86::VK8RegClass;
3501 else if (RegVT == MVT::v16i1)
3502 RC = &X86::VK16RegClass;
3503 else if (RegVT == MVT::v32i1)
3504 RC = &X86::VK32RegClass;
3505 else if (RegVT == MVT::v64i1)
3506 RC = &X86::VK64RegClass;
3507 else
3508 llvm_unreachable("Unknown argument type!");
3509
3510 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3511 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3512 }
3513
3514 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3515 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3516 // right size.
3517 if (VA.getLocInfo() == CCValAssign::SExt)
3518 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3519 DAG.getValueType(VA.getValVT()));
3520 else if (VA.getLocInfo() == CCValAssign::ZExt)
3521 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3522 DAG.getValueType(VA.getValVT()));
3523 else if (VA.getLocInfo() == CCValAssign::BCvt)
3524 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3525
3526 if (VA.isExtInLoc()) {
3527 // Handle MMX values passed in XMM regs.
3528 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3529 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3530 else if (VA.getValVT().isVector() &&
3531 VA.getValVT().getScalarType() == MVT::i1 &&
3532 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3533 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3534 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3535 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3536 } else
3537 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3538 }
3539 } else {
3540 assert(VA.isMemLoc());
3541 ArgValue =
3542 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3543 }
3544
3545 // If value is passed via pointer - do a load.
3546 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
3547 ArgValue =
3548 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3549
3550 InVals.push_back(ArgValue);
3551 }
3552
3553 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3554 // The Swift calling convention does not require that we copy the sret argument
3555 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3556 if (CallConv == CallingConv::Swift)
3557 continue;
3558
3559 // All x86 ABIs require that for returning structs by value we copy the
3560 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3561 // the argument into a virtual register so that we can access it from the
3562 // return points.
3563 if (Ins[I].Flags.isSRet()) {
3564 unsigned Reg = FuncInfo->getSRetReturnReg();
3565 if (!Reg) {
3566 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3567 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3568 FuncInfo->setSRetReturnReg(Reg);
3569 }
3570 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3571 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3572 break;
3573 }
3574 }
3575
3576 unsigned StackSize = CCInfo.getNextStackOffset();
3577 // Align stack specially for tail calls.
3578 if (shouldGuaranteeTCO(CallConv,
3579 MF.getTarget().Options.GuaranteedTailCallOpt))
3580 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3581
3582 // If the function takes a variable number of arguments, make a frame index for
3583 // the start of the first vararg value... for expansion of llvm.va_start. We
3584 // can skip this if there are no va_start calls.
3585 if (MFI.hasVAStart() &&
3586 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3587 CallConv != CallingConv::X86_ThisCall))) {
3588 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3589 }
3590
3591 // Figure out if XMM registers are in use.
3592 assert(!(Subtarget.useSoftFloat() &&
3593 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3594 "SSE register cannot be used when SSE is disabled!");
3595
3596 // 64-bit calling conventions support varargs and register parameters, so we
3597 // have to do extra work to spill them in the prologue.
3598 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3599 // Find the first unallocated argument registers.
3600 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3601 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3602 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3603 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3604 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3605 "SSE register cannot be used when SSE is disabled!");
3606
3607 // Gather all the live in physical registers.
3608 SmallVector<SDValue, 6> LiveGPRs;
3609 SmallVector<SDValue, 8> LiveXMMRegs;
3610 SDValue ALVal;
3611 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3612 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3613 LiveGPRs.push_back(
3614 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3615 }
3616 if (!ArgXMMs.empty()) {
3617 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3618 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3619 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3620 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3621 LiveXMMRegs.push_back(
3622 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3623 }
3624 }
3625
3626 if (IsWin64) {
3627 // Get to the caller-allocated home save location. Add 8 to account
3628 // for the return address.
3629 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3630 FuncInfo->setRegSaveFrameIndex(
3631 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3632 // Fixup to set vararg frame on shadow area (4 x i64).
3633 if (NumIntRegs < 4)
3634 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3635 } else {
3636 // For X86-64, if there are vararg parameters that are passed via
3637 // registers, then we must store them to their spots on the stack so
3638 // they may be loaded by dereferencing the result of va_next.
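// The save area follows the SysV AMD64 layout: ArgGPRs.size() (6) 8-byte GPR
// slots followed by ArgXMMs.size() (up to 8) 16-byte XMM slots; the GP/FP
// offsets record how much of each sub-area the named arguments already used.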
3639 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3640 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3641 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3642 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3643 }
3644
3645 // Store the integer parameter registers.
3646 SmallVector<SDValue, 8> MemOps;
3647 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3648 getPointerTy(DAG.getDataLayout()));
3649 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3650 for (SDValue Val : LiveGPRs) {
3651 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3652 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3653 SDValue Store =
3654 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3655 MachinePointerInfo::getFixedStack(
3656 DAG.getMachineFunction(),
3657 FuncInfo->getRegSaveFrameIndex(), Offset));
3658 MemOps.push_back(Store);
3659 Offset += 8;
3660 }
3661
3662 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3663 // Now store the XMM (fp + vector) parameter registers.
3664 SmallVector<SDValue, 12> SaveXMMOps;
3665 SaveXMMOps.push_back(Chain);
3666 SaveXMMOps.push_back(ALVal);
3667 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3668 FuncInfo->getRegSaveFrameIndex(), dl));
3669 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3670 FuncInfo->getVarArgsFPOffset(), dl));
3671 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3672 LiveXMMRegs.end());
3673 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3674 MVT::Other, SaveXMMOps));
3675 }
3676
3677 if (!MemOps.empty())
3678 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3679 }
3680
3681 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3682 // Find the largest legal vector type.
3683 MVT VecVT = MVT::Other;
3684 // FIXME: Only some x86_32 calling conventions support AVX512.
3685 if (Subtarget.useAVX512Regs() &&
3686 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3687 CallConv == CallingConv::Intel_OCL_BI)))
3688 VecVT = MVT::v16f32;
3689 else if (Subtarget.hasAVX())
3690 VecVT = MVT::v8f32;
3691 else if (Subtarget.hasSSE2())
3692 VecVT = MVT::v4f32;
3693
3694 // We forward some GPRs and some vector types.
3695 SmallVector<MVT, 2> RegParmTypes;
3696 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3697 RegParmTypes.push_back(IntVT);
3698 if (VecVT != MVT::Other)
3699 RegParmTypes.push_back(VecVT);
3700
3701 // Compute the set of forwarded registers. The rest are scratch.
3702 SmallVectorImpl<ForwardedRegister> &Forwards =
3703 FuncInfo->getForwardedMustTailRegParms();
3704 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3705
3706 // Forward AL for SysV x86_64 targets, since it is used for varargs.
3707 if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
3708 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3709 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3710 }
3711
3712 // Copy all forwards from physical to virtual registers.
3713 for (ForwardedRegister &FR : Forwards) {
3714 // FIXME: Can we use a less constrained schedule?
3715 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
3716 FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
3717 Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
3718 }
3719 }
3720
3721 // Some CCs need callee pop.
3722 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3723 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3724 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3725 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3726 // X86 interrupts must pop the error code (and the alignment padding) if
3727 // present.
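// On 64-bit targets this is the 8-byte error code plus 8 bytes of alignment
// padding; on 32-bit targets it is just the 4-byte error code.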
3728 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3729 } else {
3730 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3731 // If this is an sret function, the return should pop the hidden pointer.
3732 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3733 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3734 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3735 FuncInfo->setBytesToPopOnReturn(4);
3736 }
3737
3738 if (!Is64Bit) {
3739 // RegSaveFrameIndex is X86-64 only.
3740 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3741 if (CallConv == CallingConv::X86_FastCall ||
3742 CallConv == CallingConv::X86_ThisCall)
3743 // fastcc functions can't have varargs.
3744 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3745 }
3746
3747 FuncInfo->setArgumentStackSize(StackSize);
3748
3749 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3750 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3751 if (Personality == EHPersonality::CoreCLR) {
3752 assert(Is64Bit);
3753 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3754 // that we'd prefer this slot be allocated towards the bottom of the frame
3755 // (i.e. near the stack pointer after allocating the frame). Every
3756 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3757 // offset from the bottom of this and each funclet's frame must be the
3758 // same, so the size of funclets' (mostly empty) frames is dictated by
3759 // how far this slot is from the bottom (since they allocate just enough
3760 // space to accommodate holding this slot at the correct offset).
3761 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3762 EHInfo->PSPSymFrameIdx = PSPSymFI;
3763 }
3764 }
3765
3766 if (CallConv == CallingConv::X86_RegCall ||
3767 F.hasFnAttribute("no_caller_saved_registers")) {
3768 MachineRegisterInfo &MRI = MF.getRegInfo();
3769 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3770 MRI.disableCalleeSavedRegister(Pair.first);
3771 }
3772
3773 return Chain;
3774}
3775
3776SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3777 SDValue Arg, const SDLoc &dl,
3778 SelectionDAG &DAG,
3779 const CCValAssign &VA,
3780 ISD::ArgFlagsTy Flags) const {
3781 unsigned LocMemOffset = VA.getLocMemOffset();
3782 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3783 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3784 StackPtr, PtrOff);
3785 if (Flags.isByVal())
3786 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3787
3788 return DAG.getStore(
3789 Chain, dl, Arg, PtrOff,
3790 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3791}
3792
3793 /// Emit a load of the return address if tail call
3794/// optimization is performed and it is required.
3795SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3796 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3797 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3798 // Adjust the Return address stack slot.
3799 EVT VT = getPointerTy(DAG.getDataLayout());
3800 OutRetAddr = getReturnAddressFrameIndex(DAG);
3801
3802 // Load the "old" Return address.
3803 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3804 return SDValue(OutRetAddr.getNode(), 1);
3805}
3806
3807/// Emit a store of the return address if tail call
3808/// optimization is performed and it is required (FPDiff!=0).
3809static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3810 SDValue Chain, SDValue RetAddrFrIdx,
3811 EVT PtrVT, unsigned SlotSize,
3812 int FPDiff, const SDLoc &dl) {
3813 // Store the return address to the appropriate stack slot.
3814 if (!FPDiff) return Chain;
3815 // Calculate the new stack slot for the return address.
3816 int NewReturnAddrFI =
3817 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3818 false);
3819 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3820 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3821 MachinePointerInfo::getFixedStack(
3822 DAG.getMachineFunction(), NewReturnAddrFI));
3823 return Chain;
3824}
3825
3826 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3827 /// operation of the specified width.
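/// For example, with NumElems == 4 the mask is <4, 1, 2, 3>: lane 0 is taken
/// from V2 and the remaining lanes from V1, matching MOVSS semantics.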
3828static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3829 SDValue V2) {
3830 unsigned NumElems = VT.getVectorNumElements();
3831 SmallVector<int, 8> Mask;
3832 Mask.push_back(NumElems);
3833 for (unsigned i = 1; i != NumElems; ++i)
3834 Mask.push_back(i);
3835 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3836}
3837
3838SDValue
3839X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3840 SmallVectorImpl<SDValue> &InVals) const {
3841 SelectionDAG &DAG = CLI.DAG;
3842 SDLoc &dl = CLI.DL;
3843 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3844 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3845 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3846 SDValue Chain = CLI.Chain;
3847 SDValue Callee = CLI.Callee;
3848 CallingConv::ID CallConv = CLI.CallConv;
3849 bool &isTailCall = CLI.IsTailCall;
3850 bool isVarArg = CLI.IsVarArg;
3851
3852 MachineFunction &MF = DAG.getMachineFunction();
3853 bool Is64Bit = Subtarget.is64Bit();
3854 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3855 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3856 bool IsSibcall = false;
3857 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
3858 CallConv == CallingConv::Tail;
3859 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3860 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3861 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3862 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3863 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3864 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3865 bool HasNoCfCheck =
3866 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3867 const Module *M = MF.getMMI().getModule();
3868 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3869
3870 MachineFunction::CallSiteInfo CSInfo;
3871
3872 if (CallConv == CallingConv::X86_INTR)
3873 report_fatal_error("X86 interrupts may not be called directly");
3874
3875 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
3876 // If we are using a GOT, disable tail calls to external symbols with
3877 // default visibility. Tail calling such a symbol requires using a GOT
3878 // relocation, which forces early binding of the symbol. This breaks code
3879 // that requires lazy function symbol resolution. Using musttail or
3880 // GuaranteedTailCallOpt will override this.
3881 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3882 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3883 G->getGlobal()->hasDefaultVisibility()))
3884 isTailCall = false;
3885 }
3886
3887 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3888 if (IsMustTail) {
3889 // Force this to be a tail call. The verifier rules are enough to ensure
3890 // that we can lower this successfully without moving the return address
3891 // around.
3892 isTailCall = true;
3893 } else if (isTailCall) {
3894 // Check if it's really possible to do a tail call.
3895 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3896 isVarArg, SR != NotStructReturn,
3897 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3898 Outs, OutVals, Ins, DAG);
3899
3900 // Sibcalls are automatically detected tailcalls which do not require
3901 // ABI changes.
3902 if (!IsGuaranteeTCO && isTailCall)
3903 IsSibcall = true;
3904
3905 if (isTailCall)
3906 ++NumTailCalls;
3907 }
3908
3909 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3910 "Var args not supported with calling convention fastcc, ghc or hipe");
3911
3912 // Analyze operands of the call, assigning locations to each operand.
3913 SmallVector<CCValAssign, 16> ArgLocs;
3914 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3915
3916 // Allocate shadow area for Win64.
3917 if (IsWin64)
3918 CCInfo.AllocateStack(32, 8);
3919
3920 CCInfo.AnalyzeArguments(Outs, CC_X86);
3921
3922 // In vectorcall calling convention a second pass is required for the HVA
3923 // types.
3924 if (CallingConv::X86_VectorCall == CallConv) {
3925 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3926 }
3927
3928 // Get a count of how many bytes are to be pushed on the stack.
3929 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3930 if (IsSibcall)
3931 // This is a sibcall. The memory operands are available in caller's
3932 // own caller's stack.
3933 NumBytes = 0;
3934 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
3935 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3936
3937 int FPDiff = 0;
3938 if (isTailCall && !IsSibcall && !IsMustTail) {
3939 // Lower arguments at fp - stackoffset + fpdiff.
3940 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3941
3942 FPDiff = NumBytesCallerPushed - NumBytes;
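// A negative FPDiff means the callee needs more argument stack space than the
// caller pushed, so the return address is re-stored that many bytes further
// down the stack (see EmitTailCallStoreRetAddr).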
3943
3944 // Set the delta of movement of the returnaddr stackslot.
3945 // But only set if delta is greater than previous delta.
3946 if (FPDiff < X86Info->getTCReturnAddrDelta())
3947 X86Info->setTCReturnAddrDelta(FPDiff);
3948 }
3949
3950 unsigned NumBytesToPush = NumBytes;
3951 unsigned NumBytesToPop = NumBytes;
3952
3953 // If we have an inalloca argument, all stack space has already been allocated
3954 // for us and is right at the top of the stack. We don't support multiple
3955 // arguments passed in memory when using inalloca.
3956 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3957 NumBytesToPush = 0;
3958 if (!ArgLocs.back().isMemLoc())
3959 report_fatal_error("cannot use inalloca attribute on a register "
3960 "parameter");
3961 if (ArgLocs.back().getLocMemOffset() != 0)
3962 report_fatal_error("any parameter with the inalloca attribute must be "
3963 "the only memory argument");
3964 }
3965
3966 if (!IsSibcall && !IsMustTail)
3967 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3968 NumBytes - NumBytesToPush, dl);
3969
3970 SDValue RetAddrFrIdx;
3971 // Load return address for tail calls.
3972 if (isTailCall && FPDiff)
3973 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3974 Is64Bit, FPDiff, dl);
3975
3976 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3977 SmallVector<SDValue, 8> MemOpChains;
3978 SDValue StackPtr;
3979
3980 // The next loop assumes that the locations are in the same order as the
3981 // input arguments.
3982 assert(isSortedByValueNo(ArgLocs) &&
3983 "Argument Location list must be sorted before lowering");
3984
3985 // Walk the register/memloc assignments, inserting copies/loads. In the case
3986 // of tail call optimization, arguments are handled later.
3987 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3988 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3989 ++I, ++OutIndex) {
3990 assert(OutIndex < Outs.size() && "Invalid Out index")((OutIndex < Outs.size() && "Invalid Out index") ?
static_cast<void> (0) : __assert_fail ("OutIndex < Outs.size() && \"Invalid Out index\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 3990, __PRETTY_FUNCTION__))
;
3991 // Skip inalloca arguments, they have already been written.
3992 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3993 if (Flags.isInAlloca())
3994 continue;
3995
3996 CCValAssign &VA = ArgLocs[I];
3997 EVT RegVT = VA.getLocVT();
3998 SDValue Arg = OutVals[OutIndex];
3999 bool isByVal = Flags.isByVal();
4000
4001 // Promote the value if needed.
4002 switch (VA.getLocInfo()) {
4003 default: llvm_unreachable("Unknown loc info!")::llvm::llvm_unreachable_internal("Unknown loc info!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4003)
;
4004 case CCValAssign::Full: break;
4005 case CCValAssign::SExt:
4006 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4007 break;
4008 case CCValAssign::ZExt:
4009 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4010 break;
4011 case CCValAssign::AExt:
4012 if (Arg.getValueType().isVector() &&
4013 Arg.getValueType().getVectorElementType() == MVT::i1)
4014 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4015 else if (RegVT.is128BitVector()) {
4016 // Special case: passing MMX values in XMM registers.
4017 Arg = DAG.getBitcast(MVT::i64, Arg);
4018 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4019 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4020 } else
4021 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4022 break;
4023 case CCValAssign::BCvt:
4024 Arg = DAG.getBitcast(RegVT, Arg);
4025 break;
4026 case CCValAssign::Indirect: {
4027 if (isByVal) {
4028 // Memcpy the argument to a temporary stack slot to prevent
4029 // the caller from seeing any modifications the callee may make
4030 // as guaranteed by the `byval` attribute.
4031 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4032 Flags.getByValSize(),
4033 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4034 SDValue StackSlot =
4035 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4036 Chain =
4037 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4038 // From now on treat this as a regular pointer
4039 Arg = StackSlot;
4040 isByVal = false;
4041 } else {
4042 // Store the argument.
4043 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4044 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4045 Chain = DAG.getStore(
4046 Chain, dl, Arg, SpillSlot,
4047 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4048 Arg = SpillSlot;
4049 }
4050 break;
4051 }
4052 }
4053
4054 if (VA.needsCustom()) {
4055 assert(VA.getValVT() == MVT::v64i1 &&
4056 "Currently the only custom case is when we split v64i1 to 2 regs");
4057 // Split v64i1 value into two registers
4058 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4059 } else if (VA.isRegLoc()) {
4060 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4061 const TargetOptions &Options = DAG.getTarget().Options;
4062 if (Options.EnableDebugEntryValues)
4063 CSInfo.emplace_back(VA.getLocReg(), I);
4064 if (isVarArg && IsWin64) {
4065 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4066 // shadow reg if callee is a varargs function.
4067 unsigned ShadowReg = 0;
4068 switch (VA.getLocReg()) {
4069 case X86::XMM0: ShadowReg = X86::RCX; break;
4070 case X86::XMM1: ShadowReg = X86::RDX; break;
4071 case X86::XMM2: ShadowReg = X86::R8; break;
4072 case X86::XMM3: ShadowReg = X86::R9; break;
4073 }
4074 if (ShadowReg)
4075 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4076 }
4077 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4078 assert(VA.isMemLoc());
4079 if (!StackPtr.getNode())
4080 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4081 getPointerTy(DAG.getDataLayout()));
4082 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4083 dl, DAG, VA, Flags));
4084 }
4085 }
4086
4087 if (!MemOpChains.empty())
4088 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4089
4090 if (Subtarget.isPICStyleGOT()) {
4091 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4092 // GOT pointer.
4093 if (!isTailCall) {
4094 RegsToPass.push_back(std::make_pair(
4095 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4096 getPointerTy(DAG.getDataLayout()))));
4097 } else {
4098 // If we are tail calling and generating PIC/GOT style code load the
4099 // address of the callee into ECX. The value in ecx is used as target of
4100 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4101 // for tail calls on PIC/GOT architectures. Normally we would just put the
4102 // address of GOT into ebx and then call target@PLT. But for tail calls
4103 // ebx would be restored (since ebx is callee saved) before jumping to the
4104 // target@PLT.
4105
4106 // Note: The actual moving to ECX is done further down.
4107 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4108 if (G && !G->getGlobal()->hasLocalLinkage() &&
4109 G->getGlobal()->hasDefaultVisibility())
4110 Callee = LowerGlobalAddress(Callee, DAG);
4111 else if (isa<ExternalSymbolSDNode>(Callee))
4112 Callee = LowerExternalSymbol(Callee, DAG);
4113 }
4114 }
4115
4116 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
4117 // From AMD64 ABI document:
4118 // For calls that may call functions that use varargs or stdargs
4119 // (prototype-less calls or calls to functions containing ellipsis (...) in
4120 // the declaration) %al is used as hidden argument to specify the number
4121 // of SSE registers used. The contents of %al do not need to match exactly
4122 // the number of registers, but must be an upper bound on the number of SSE
4123 // registers used and must be in the range 0 - 8 inclusive.
4124
4125 // Count the number of XMM registers allocated.
4126 static const MCPhysReg XMMArgRegs[] = {
4127 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4128 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4129 };
4130 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4131 assert((Subtarget.hasSSE1() || !NumXMMRegs)
4132 && "SSE registers cannot be used when SSE is disabled");
4133
4134 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
4135 DAG.getConstant(NumXMMRegs, dl,
4136 MVT::i8)));
4137 }
4138
4139 if (isVarArg && IsMustTail) {
4140 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4141 for (const auto &F : Forwards) {
4142 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4143 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
4144 }
4145 }
4146
4147 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4148 // don't need this because the eligibility check rejects calls that require
4149 // shuffling arguments passed in memory.
4150 if (!IsSibcall && isTailCall) {
4151 // Force all the incoming stack arguments to be loaded from the stack
4152 // before any new outgoing arguments are stored to the stack, because the
4153 // outgoing stack slots may alias the incoming argument stack slots, and
4154 // the alias isn't otherwise explicit. This is slightly more conservative
4155 // than necessary, because it means that each store effectively depends
4156 // on every argument instead of just those arguments it would clobber.
4157 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4158
4159 SmallVector<SDValue, 8> MemOpChains2;
4160 SDValue FIN;
4161 int FI = 0;
4162 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4163 ++I, ++OutsIndex) {
4164 CCValAssign &VA = ArgLocs[I];
4165
4166 if (VA.isRegLoc()) {
4167 if (VA.needsCustom()) {
4168 assert((CallConv == CallingConv::X86_RegCall) &&
4169 "Expecting custom case only in regcall calling convention");
4170 // This means that we are in special case where one argument was
4171 // passed through two register locations - Skip the next location
4172 ++I;
4173 }
4174
4175 continue;
4176 }
4177
4178 assert(VA.isMemLoc());
4179 SDValue Arg = OutVals[OutsIndex];
4180 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4181 // Skip inalloca arguments. They don't require any work.
4182 if (Flags.isInAlloca())
4183 continue;
4184 // Create frame index.
4185 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4186 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4187 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4188 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4189
4190 if (Flags.isByVal()) {
4191 // Copy relative to framepointer.
4192 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4193 if (!StackPtr.getNode())
4194 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4195 getPointerTy(DAG.getDataLayout()));
4196 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4197 StackPtr, Source);
4198
4199 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4200 ArgChain,
4201 Flags, DAG, dl));
4202 } else {
4203 // Store relative to framepointer.
4204 MemOpChains2.push_back(DAG.getStore(
4205 ArgChain, dl, Arg, FIN,
4206 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4207 }
4208 }
4209
4210 if (!MemOpChains2.empty())
4211 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4212
4213 // Store the return address to the appropriate stack slot.
4214 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4215 getPointerTy(DAG.getDataLayout()),
4216 RegInfo->getSlotSize(), FPDiff, dl);
4217 }
4218
4219 // Build a sequence of copy-to-reg nodes chained together with token chain
4220 // and flag operands which copy the outgoing args into registers.
4221 SDValue InFlag;
4222 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4223 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4224 RegsToPass[i].second, InFlag);
4225 InFlag = Chain.getValue(1);
4226 }
4227
4228 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4229 assert(Is64Bit && "Large code model is only legal in 64-bit mode.")((Is64Bit && "Large code model is only legal in 64-bit mode."
) ? static_cast<void> (0) : __assert_fail ("Is64Bit && \"Large code model is only legal in 64-bit mode.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4229, __PRETTY_FUNCTION__))
;
4230 // In the 64-bit large code model, we have to make all calls
4231 // through a register, since the call instruction's 32-bit
4232 // pc-relative offset may not be large enough to hold the whole
4233 // address.
4234 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4235 Callee->getOpcode() == ISD::ExternalSymbol) {
4236 // Lower direct calls to global addresses and external symbols. Setting
4237 // ForCall to true here has the effect of removing WrapperRIP when possible
4238 // to allow direct calls to be selected without first materializing the
4239 // address into a register.
4240 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4241 } else if (Subtarget.isTarget64BitILP32() &&
4242 Callee->getValueType(0) == MVT::i32) {
4243 // Zero-extend the 32-bit Callee address into a 64-bit value, per the x32 ABI.
4244 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4245 }
4246
4247 // Returns a chain & a flag for retval copy to use.
4248 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4249 SmallVector<SDValue, 8> Ops;
4250
4251 if (!IsSibcall && isTailCall && !IsMustTail) {
4252 Chain = DAG.getCALLSEQ_END(Chain,
4253 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4254 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
4255 InFlag = Chain.getValue(1);
4256 }
4257
4258 Ops.push_back(Chain);
4259 Ops.push_back(Callee);
4260
4261 if (isTailCall)
4262 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
4263
4264 // Add argument registers to the end of the list so that they are known live
4265 // into the call.
4266 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4267 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4268 RegsToPass[i].second.getValueType()));
4269
4270 // Add a register mask operand representing the call-preserved registers.
4271 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
4272 // set X86_INTR calling convention because it has the same CSR mask
4273 // (same preserved registers).
4274 const uint32_t *Mask = RegInfo->getCallPreservedMask(
4275 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
4276 assert(Mask && "Missing call preserved mask for calling convention")((Mask && "Missing call preserved mask for calling convention"
) ? static_cast<void> (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4276, __PRETTY_FUNCTION__))
;
4277
4278 // If this is an invoke in a 32-bit function using a funclet-based
4279 // personality, assume the function clobbers all registers. If an exception
4280 // is thrown, the runtime will not restore CSRs.
4281 // FIXME: Model this more precisely so that we can register allocate across
4282 // the normal edge and spill and fill across the exceptional edge.
4283 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
4284 const Function &CallerFn = MF.getFunction();
4285 EHPersonality Pers =
4286 CallerFn.hasPersonalityFn()
4287 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4288 : EHPersonality::Unknown;
4289 if (isFuncletEHPersonality(Pers))
4290 Mask = RegInfo->getNoPreservedMask();
4291 }
4292
4293 // Define a new register mask from the existing mask.
4294 uint32_t *RegMask = nullptr;
4295
4296 // In some calling conventions we need to remove the used physical registers
4297 // from the reg mask.
4298 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
4299 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4300
4301 // Allocate a new Reg Mask and copy Mask.
4302 RegMask = MF.allocateRegMask();
4303 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4304 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4305
4306 // Make sure all sub registers of the argument registers are reset
4307 // in the RegMask.
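// A set bit in a register mask means "preserved across the call", so clearing
// the bit for each argument register (and its sub-registers) marks it as
// clobbered by this call.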
4308 for (auto const &RegPair : RegsToPass)
4309 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4310 SubRegs.isValid(); ++SubRegs)
4311 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4312
4313 // Create the RegMask Operand according to our updated mask.
4314 Ops.push_back(DAG.getRegisterMask(RegMask));
4315 } else {
4316 // Create the RegMask Operand according to the static mask.
4317 Ops.push_back(DAG.getRegisterMask(Mask));
4318 }
4319
4320 if (InFlag.getNode())
4321 Ops.push_back(InFlag);
4322
4323 if (isTailCall) {
4324 // We used to do:
4325 //// If this is the first return lowered for this function, add the regs
4326 //// to the liveout set for the function.
4327 // This isn't right, although it's probably harmless on x86; liveouts
4328 // should be computed from returns not tail calls. Consider a void
4329 // function making a tail call to a function returning int.
4330 MF.getFrameInfo().setHasTailCall();
4331 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4332 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4333 return Ret;
4334 }
4335
4336 if (HasNoCfCheck && IsCFProtectionSupported) {
4337 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4338 } else {
4339 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4340 }
4341 InFlag = Chain.getValue(1);
4342 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4343
4344 // Save heapallocsite metadata.
4345 if (CLI.CS)
4346 if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
4347 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4348
4349 // Create the CALLSEQ_END node.
4350 unsigned NumBytesForCalleeToPop;
4351 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4352 DAG.getTarget().Options.GuaranteedTailCallOpt))
4353 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4354 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
4355 !Subtarget.getTargetTriple().isOSMSVCRT() &&
4356 SR == StackStructReturn)
4357 // If this is a call to a struct-return function, the callee
4358 // pops the hidden struct pointer, so we have to push it back.
4359 // This is common for Darwin/X86, Linux & Mingw32 targets.
4360 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
4361 NumBytesForCalleeToPop = 4;
4362 else
4363 NumBytesForCalleeToPop = 0; // Callee pops nothing.
4364
4365 // Returns a flag for retval copy to use.
4366 if (!IsSibcall) {
4367 Chain = DAG.getCALLSEQ_END(Chain,
4368 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
4369 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
4370 true),
4371 InFlag, dl);
4372 InFlag = Chain.getValue(1);
4373 }
4374
4375 // Handle result values, copying them out of physregs into vregs that we
4376 // return.
4377 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4378 InVals, RegMask);
4379}
4380
4381//===----------------------------------------------------------------------===//
4382// Fast Calling Convention (tail call) implementation
4383//===----------------------------------------------------------------------===//
4384
4385 // Like stdcall, the callee cleans up the arguments, except that ECX is
4386 // reserved for storing the tail-called function address. Only 2 registers are
4387// free for argument passing (inreg). Tail call optimization is performed
4388// provided:
4389// * tailcallopt is enabled
4390// * caller/callee are fastcc
4391// On X86_64 architecture with GOT-style position independent code only local
4392// (within module) calls are supported at the moment.
4393 // To keep the stack aligned according to the platform ABI, the function
4394 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4395 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.)
4396 // If the tail-called callee has more arguments than the caller, the caller
4397 // needs to make sure that there is room to move the RETADDR to. This is
4398// achieved by reserving an area the size of the argument delta right after the
4399// original RETADDR, but before the saved framepointer or the spilled registers
4400// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4401// stack layout:
4402// arg1
4403// arg2
4404// RETADDR
4405// [ new RETADDR
4406// move area ]
4407// (possible EBP)
4408// ESI
4409// EDI
4410// local1 ..
4411
4412 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
4413 /// requirement.
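/// For example, with a 4-byte slot and 16-byte stack alignment, a StackSize of
/// 8 yields alignTo(8 + 4, 16) - 4 = 12, i.e. the 16n + 12 form mentioned above.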
4414unsigned
4415X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
4416 SelectionDAG &DAG) const {
4417 const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
4418 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
4419 assert(StackSize % SlotSize == 0 &&
4420 "StackSize must be a multiple of SlotSize");
4421 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
4422}
4423
4424/// Return true if the given stack call argument is already available in the
4425/// same position (relatively) of the caller's incoming argument stack.
4426static
4427bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4428 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4429 const X86InstrInfo *TII, const CCValAssign &VA) {
4430 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4431
4432 for (;;) {
4433 // Look through nodes that don't alter the bits of the incoming value.
4434 unsigned Op = Arg.getOpcode();
4435 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4436 Arg = Arg.getOperand(0);
4437 continue;
4438 }
4439 if (Op == ISD::TRUNCATE) {
4440 const SDValue &TruncInput = Arg.getOperand(0);
4441 if (TruncInput.getOpcode() == ISD::AssertZext &&
4442 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4443 Arg.getValueType()) {
4444 Arg = TruncInput.getOperand(0);
4445 continue;
4446 }
4447 }
4448 break;
4449 }
4450
4451 int FI = INT_MAX;
4452 if (Arg.getOpcode() == ISD::CopyFromReg) {
4453 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4454 if (!Register::isVirtualRegister(VR))
4455 return false;
4456 MachineInstr *Def = MRI->getVRegDef(VR);
4457 if (!Def)
4458 return false;
4459 if (!Flags.isByVal()) {
4460 if (!TII->isLoadFromStackSlot(*Def, FI))
4461 return false;
4462 } else {
4463 unsigned Opcode = Def->getOpcode();
4464 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4465 Opcode == X86::LEA64_32r) &&
4466 Def->getOperand(1).isFI()) {
4467 FI = Def->getOperand(1).getIndex();
4468 Bytes = Flags.getByValSize();
4469 } else
4470 return false;
4471 }
4472 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4473 if (Flags.isByVal())
4474 // ByVal argument is passed in as a pointer but it's now being
4475 // dereferenced. e.g.
4476 // define @foo(%struct.X* %A) {
4477 // tail call @bar(%struct.X* byval %A)
4478 // }
4479 return false;
4480 SDValue Ptr = Ld->getBasePtr();
4481 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4482 if (!FINode)
4483 return false;
4484 FI = FINode->getIndex();
4485 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4486 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4487 FI = FINode->getIndex();
4488 Bytes = Flags.getByValSize();
4489 } else
4490 return false;
4491
4492 assert(FI != INT_MAX);
4493 if (!MFI.isFixedObjectIndex(FI))
4494 return false;
4495
4496 if (Offset != MFI.getObjectOffset(FI))
4497 return false;
4498
4499 // If this is not byval, check that the argument stack object is immutable.
4500 // inalloca and argument copy elision can create mutable argument stack
4501 // objects. Byval objects can be mutated, but a byval call intends to pass the
4502 // mutated memory.
4503 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4504 return false;
4505
4506 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4507 // If the argument location is wider than the argument type, check that any
4508 // extension flags match.
4509 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4510 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4511 return false;
4512 }
4513 }
4514
4515 return Bytes == MFI.getObjectSize(FI);
4516}
4517
4518/// Check whether the call is eligible for tail call optimization. Targets
4519/// that want to do tail call optimization should implement this function.
4520bool X86TargetLowering::IsEligibleForTailCallOptimization(
4521 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4522 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4523 const SmallVectorImpl<ISD::OutputArg> &Outs,
4524 const SmallVectorImpl<SDValue> &OutVals,
4525 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4526 if (!mayTailCallThisCC(CalleeCC))
4527 return false;
4528
4529 // If -tailcallopt is specified, make fastcc functions tail-callable.
4530 MachineFunction &MF = DAG.getMachineFunction();
4531 const Function &CallerF = MF.getFunction();
4532
4533 // If the function return type is x86_fp80 and the callee return type is not,
4534 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4535 // perform a tailcall optimization here.
4536 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4537 return false;
4538
4539 CallingConv::ID CallerCC = CallerF.getCallingConv();
4540 bool CCMatch = CallerCC == CalleeCC;
4541 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4542 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4543 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
4544 CalleeCC == CallingConv::Tail;
4545
4546 // Win64 functions have extra shadow space for argument homing. Don't do the
4547 // sibcall if the caller and callee have mismatched expectations for this
4548 // space.
4549 if (IsCalleeWin64 != IsCallerWin64)
4550 return false;
4551
4552 if (IsGuaranteeTCO) {
4553 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4554 return true;
4555 return false;
4556 }
4557
4558 // Look for obvious safe cases to perform tail call optimization that do not
4559 // require ABI changes. This is what gcc calls sibcall.
4560
4561 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4562 // emit a special epilogue.
4563 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4564 if (RegInfo->needsStackRealignment(MF))
4565 return false;
4566
4567 // Also avoid sibcall optimization if either caller or callee uses struct
4568 // return semantics.
4569 if (isCalleeStructRet || isCallerStructRet)
4570 return false;
4571
4572 // Do not sibcall optimize vararg calls unless all arguments are passed via
4573 // registers.
4574 LLVMContext &C = *DAG.getContext();
4575 if (isVarArg && !Outs.empty()) {
4576 // Optimizing for varargs on Win64 is unlikely to be safe without
4577 // additional testing.
4578 if (IsCalleeWin64 || IsCallerWin64)
4579 return false;
4580
4581 SmallVector<CCValAssign, 16> ArgLocs;
4582 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4583
4584 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4585 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4586 if (!ArgLocs[i].isRegLoc())
4587 return false;
4588 }
4589
4590 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4591 // stack. Therefore, if it's not used by the call it is not safe to optimize
4592 // this into a sibcall.
4593 bool Unused = false;
4594 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4595 if (!Ins[i].Used) {
4596 Unused = true;
4597 break;
4598 }
4599 }
4600 if (Unused) {
4601 SmallVector<CCValAssign, 16> RVLocs;
4602 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4603 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4604 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4605 CCValAssign &VA = RVLocs[i];
4606 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4607 return false;
4608 }
4609 }
4610
4611 // Check that the call results are passed in the same way.
4612 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4613 RetCC_X86, RetCC_X86))
4614 return false;
4615 // The callee has to preserve all registers the caller needs to preserve.
4616 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4617 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4618 if (!CCMatch) {
4619 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4620 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4621 return false;
4622 }
4623
4624 unsigned StackArgsSize = 0;
4625
4626 // If the callee takes no arguments then go on to check the results of the
4627 // call.
4628 if (!Outs.empty()) {
4629 // Check if stack adjustment is needed. For now, do not do this if any
4630 // argument is passed on the stack.
4631 SmallVector<CCValAssign, 16> ArgLocs;
4632 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4633
4634 // Allocate shadow area for Win64
4635 if (IsCalleeWin64)
4636 CCInfo.AllocateStack(32, 8);
4637
4638 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4639 StackArgsSize = CCInfo.getNextStackOffset();
4640
4641 if (CCInfo.getNextStackOffset()) {
4642 // Check if the arguments are already laid out in the right way as
4643 // the caller's fixed stack objects.
4644 MachineFrameInfo &MFI = MF.getFrameInfo();
4645 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4646 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4647 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4648 CCValAssign &VA = ArgLocs[i];
4649 SDValue Arg = OutVals[i];
4650 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4651 if (VA.getLocInfo() == CCValAssign::Indirect)
4652 return false;
4653 if (!VA.isRegLoc()) {
4654 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4655 MFI, MRI, TII, VA))
4656 return false;
4657 }
4658 }
4659 }
4660
4661 bool PositionIndependent = isPositionIndependent();
4662 // If the tailcall address may be in a register, then make sure it's
4663 // possible to register allocate for it. In 32-bit, the call address can
4664 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4665 // callee-saved registers are restored. These happen to be the same
4666 // registers used to pass 'inreg' arguments so watch out for those.
4667 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4668 !isa<ExternalSymbolSDNode>(Callee)) ||
4669 PositionIndependent)) {
4670 unsigned NumInRegs = 0;
4671 // In PIC we need an extra register to formulate the address computation
4672 // for the callee.
4673 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4674
4675 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4676 CCValAssign &VA = ArgLocs[i];
4677 if (!VA.isRegLoc())
4678 continue;
4679 Register Reg = VA.getLocReg();
4680 switch (Reg) {
4681 default: break;
4682 case X86::EAX: case X86::EDX: case X86::ECX:
4683 if (++NumInRegs == MaxInRegs)
4684 return false;
4685 break;
4686 }
4687 }
4688 }
4689
4690 const MachineRegisterInfo &MRI = MF.getRegInfo();
4691 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4692 return false;
4693 }
4694
4695 bool CalleeWillPop =
4696 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4697 MF.getTarget().Options.GuaranteedTailCallOpt);
4698
4699 if (unsigned BytesToPop =
4700 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4701 // If we have bytes to pop, the callee must pop them.
4702 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4703 if (!CalleePopMatches)
4704 return false;
4705 } else if (CalleeWillPop && StackArgsSize > 0) {
4706 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4707 return false;
4708 }
4709
4710 return true;
4711}
4712
4713FastISel *
4714X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4715 const TargetLibraryInfo *libInfo) const {
4716 return X86::createFastISel(funcInfo, libInfo);
4717}
4718
4719//===----------------------------------------------------------------------===//
4720// Other Lowering Hooks
4721//===----------------------------------------------------------------------===//
4722
4723static bool MayFoldLoad(SDValue Op) {
4724 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4725}
4726
4727static bool MayFoldIntoStore(SDValue Op) {
4728 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4729}
4730
4731static bool MayFoldIntoZeroExtend(SDValue Op) {
4732 if (Op.hasOneUse()) {
4733 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4734 return (ISD::ZERO_EXTEND == Opcode);
4735 }
4736 return false;
4737}
4738
4739static bool isTargetShuffle(unsigned Opcode) {
4740 switch(Opcode) {
4741 default: return false;
4742 case X86ISD::BLENDI:
4743 case X86ISD::PSHUFB:
4744 case X86ISD::PSHUFD:
4745 case X86ISD::PSHUFHW:
4746 case X86ISD::PSHUFLW:
4747 case X86ISD::SHUFP:
4748 case X86ISD::INSERTPS:
4749 case X86ISD::EXTRQI:
4750 case X86ISD::INSERTQI:
4751 case X86ISD::PALIGNR:
4752 case X86ISD::VSHLDQ:
4753 case X86ISD::VSRLDQ:
4754 case X86ISD::MOVLHPS:
4755 case X86ISD::MOVHLPS:
4756 case X86ISD::MOVSHDUP:
4757 case X86ISD::MOVSLDUP:
4758 case X86ISD::MOVDDUP:
4759 case X86ISD::MOVSS:
4760 case X86ISD::MOVSD:
4761 case X86ISD::UNPCKL:
4762 case X86ISD::UNPCKH:
4763 case X86ISD::VBROADCAST:
4764 case X86ISD::VPERMILPI:
4765 case X86ISD::VPERMILPV:
4766 case X86ISD::VPERM2X128:
4767 case X86ISD::SHUF128:
4768 case X86ISD::VPERMIL2:
4769 case X86ISD::VPERMI:
4770 case X86ISD::VPPERM:
4771 case X86ISD::VPERMV:
4772 case X86ISD::VPERMV3:
4773 case X86ISD::VZEXT_MOVL:
4774 return true;
4775 }
4776}
4777
4778static bool isTargetShuffleVariableMask(unsigned Opcode) {
4779 switch (Opcode) {
4780 default: return false;
4781 // Target Shuffles.
4782 case X86ISD::PSHUFB:
4783 case X86ISD::VPERMILPV:
4784 case X86ISD::VPERMIL2:
4785 case X86ISD::VPPERM:
4786 case X86ISD::VPERMV:
4787 case X86ISD::VPERMV3:
4788 return true;
4789 // 'Faux' Target Shuffles.
4790 case ISD::OR:
4791 case ISD::AND:
4792 case X86ISD::ANDNP:
4793 return true;
4794 }
4795}
4796
4797SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4798 MachineFunction &MF = DAG.getMachineFunction();
4799 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4800 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4801 int ReturnAddrIndex = FuncInfo->getRAIndex();
4802
4803 if (ReturnAddrIndex == 0) {
4804 // Set up a frame object for the return address.
4805 unsigned SlotSize = RegInfo->getSlotSize();
4806 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4807 -(int64_t)SlotSize,
4808 false);
4809 FuncInfo->setRAIndex(ReturnAddrIndex);
4810 }
4811
4812 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4813}
4814
4815bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4816 bool hasSymbolicDisplacement) {
4817 // Offset should fit into 32 bit immediate field.
4818 if (!isInt<32>(Offset))
4819 return false;
4820
4821 // If we don't have a symbolic displacement - we don't have any extra
4822 // restrictions.
4823 if (!hasSymbolicDisplacement)
4824 return true;
4825
4826 // FIXME: Some tweaks might be needed for medium code model.
4827 if (M != CodeModel::Small && M != CodeModel::Kernel)
4828 return false;
4829
4830 // For the small code model we assume that the last object ends at least 16MB
4831 // before the 31-bit boundary. We also accept fairly large negative constants,
4832 // knowing that all objects live in the positive half of the address space.
4833 if (M == CodeModel::Small && Offset < 16*1024*1024)
4834 return true;
4835
4836 // For the kernel code model we know that all objects reside in the negative
4837 // half of the 32-bit address space, so we must not accept negative offsets,
4838 // but we can accept fairly large positive ones.
4839 if (M == CodeModel::Kernel && Offset >= 0)
4840 return true;
4841
4842 return false;
4843}
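// Worked examples of the rules above, as a standalone sketch (it assumes a
// symbolic displacement is present and restates the checks; it is not the
// helper itself):
constexpr bool sketchFitsInt32(long long Off) {
  return Off >= -2147483647LL - 1 && Off <= 2147483647LL;
}
constexpr bool sketchSmallModelOK(long long Off) {
  return sketchFitsInt32(Off) && Off < 16LL * 1024 * 1024;
}
constexpr bool sketchKernelModelOK(long long Off) {
  return sketchFitsInt32(Off) && Off >= 0;
}
static_assert(sketchSmallModelOK(-0x40000000LL), "large negative offsets pass in the small model");
static_assert(!sketchSmallModelOK(32LL * 1024 * 1024), "past the 16MB cushion is rejected");
static_assert(sketchKernelModelOK(0x40000000LL), "large positive offsets pass in the kernel model");
static_assert(!sketchKernelModelOK(-8), "negative offsets are rejected in the kernel model");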
4844
4845/// Determines whether the callee is required to pop its own arguments.
4846/// Callee pop is necessary to support tail calls.
4847bool X86::isCalleePop(CallingConv::ID CallingConv,
4848 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4849 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4850 // can guarantee TCO.
4851 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4852 return true;
4853
4854 switch (CallingConv) {
4855 default:
4856 return false;
4857 case CallingConv::X86_StdCall:
4858 case CallingConv::X86_FastCall:
4859 case CallingConv::X86_ThisCall:
4860 case CallingConv::X86_VectorCall:
4861 return !is64Bit;
4862 }
4863}
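// As a concrete sketch of the convention above (the assembly is illustrative):
//
//   ; 32-bit __stdcall f(int, int, int) - the callee pops its 12 bytes of args:
//   _f@12:
//     ...
//     ret 12
//
//   ; the same signature under the 64-bit SysV convention - the caller owns the
//   ; argument area and the callee returns with a plain 'ret'.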
4864
4865 /// Return true if the condition is a signed comparison operation.
4866static bool isX86CCSigned(unsigned X86CC) {
4867 switch (X86CC) {
4868 default:
4869 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 4869)
;
4870 case X86::COND_E:
4871 case X86::COND_NE:
4872 case X86::COND_B:
4873 case X86::COND_A:
4874 case X86::COND_BE:
4875 case X86::COND_AE:
4876 return false;
4877 case X86::COND_G:
4878 case X86::COND_GE:
4879 case X86::COND_L:
4880 case X86::COND_LE:
4881 return true;
4882 }
4883}
4884
4885static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4886 switch (SetCCOpcode) {
4887 default: llvm_unreachable("Invalid integer condition!");
4888 case ISD::SETEQ: return X86::COND_E;
4889 case ISD::SETGT: return X86::COND_G;
4890 case ISD::SETGE: return X86::COND_GE;
4891 case ISD::SETLT: return X86::COND_L;
4892 case ISD::SETLE: return X86::COND_LE;
4893 case ISD::SETNE: return X86::COND_NE;
4894 case ISD::SETULT: return X86::COND_B;
4895 case ISD::SETUGT: return X86::COND_A;
4896 case ISD::SETULE: return X86::COND_BE;
4897 case ISD::SETUGE: return X86::COND_AE;
4898 }
4899}
4900
4901 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4902/// condition code, returning the condition code and the LHS/RHS of the
4903/// comparison to make.
4904static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4905 bool isFP, SDValue &LHS, SDValue &RHS,
4906 SelectionDAG &DAG) {
4907 if (!isFP) {
4908 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4909 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4910 // X > -1 -> X == 0, jump !sign.
4911 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4912 return X86::COND_NS;
4913 }
4914 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4915 // X < 0 -> X == 0, jump on sign.
4916 return X86::COND_S;
4917 }
4918 if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
4919 // X >= 0 -> X == 0, jump on !sign.
4920 return X86::COND_NS;
4921 }
4922 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
4923 // X < 1 -> X <= 0
4924 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4925 return X86::COND_LE;
4926 }
4927 }
4928
4929 return TranslateIntegerX86CC(SetCCOpcode);
4930 }
4931
4932 // First determine if it is required or profitable to flip the operands.
4933
4934 // If LHS is a foldable load, but RHS is not, flip the condition.
4935 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4936 !ISD::isNON_EXTLoad(RHS.getNode())) {
4937 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4938 std::swap(LHS, RHS);
4939 }
4940
4941 switch (SetCCOpcode) {
4942 default: break;
4943 case ISD::SETOLT:
4944 case ISD::SETOLE:
4945 case ISD::SETUGT:
4946 case ISD::SETUGE:
4947 std::swap(LHS, RHS);
4948 break;
4949 }
4950
4951 // On a floating point condition, the flags are set as follows:
4952 // ZF PF CF op
4953 // 0 | 0 | 0 | X > Y
4954 // 0 | 0 | 1 | X < Y
4955 // 1 | 0 | 0 | X == Y
4956 // 1 | 1 | 1 | unordered
4957 switch (SetCCOpcode) {
4958 default: llvm_unreachable("Condcode should be pre-legalized away");
4959 case ISD::SETUEQ:
4960 case ISD::SETEQ: return X86::COND_E;
4961 case ISD::SETOLT: // flipped
4962 case ISD::SETOGT:
4963 case ISD::SETGT: return X86::COND_A;
4964 case ISD::SETOLE: // flipped
4965 case ISD::SETOGE:
4966 case ISD::SETGE: return X86::COND_AE;
4967 case ISD::SETUGT: // flipped
4968 case ISD::SETULT:
4969 case ISD::SETLT: return X86::COND_B;
4970 case ISD::SETUGE: // flipped
4971 case ISD::SETULE:
4972 case ISD::SETLE: return X86::COND_BE;
4973 case ISD::SETONE:
4974 case ISD::SETNE: return X86::COND_NE;
4975 case ISD::SETUO: return X86::COND_P;
4976 case ISD::SETO: return X86::COND_NP;
4977 case ISD::SETOEQ:
4978 case ISD::SETUNE: return X86::COND_INVALID;
4979 }
4980}
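// A minimal standalone model of the flag table above (a sketch for intuition,
// not a call into the lowering code): after a UCOMISS/UCOMISD-style compare,
// COND_A ("above": CF == 0 && ZF == 0) holds exactly for an ordered X > Y,
// which is why SETOGT (and the flipped SETOLT) map to COND_A; unordered inputs
// set ZF, PF and CF, which is what COND_P/COND_NP test.
struct SketchFPFlags { bool ZF, PF, CF; };
constexpr SketchFPFlags sketchFCmpFlags(double X, double Y) {
  return (X != X || Y != Y) ? SketchFPFlags{true, true, true}     // unordered
         : (X > Y)          ? SketchFPFlags{false, false, false}  // above
         : (X < Y)          ? SketchFPFlags{false, false, true}   // below
                            : SketchFPFlags{true, false, false};  // equal
}
static_assert(!sketchFCmpFlags(2.0, 1.0).CF && !sketchFCmpFlags(2.0, 1.0).ZF,
              "2 > 1 leaves CF and ZF clear, so COND_A is taken");
static_assert(sketchFCmpFlags(1.0, 2.0).CF, "1 < 2 sets CF, so COND_B is taken");
static_assert(sketchFCmpFlags(1.0, 1.0).ZF, "1 == 1 sets ZF, so COND_E is taken");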
4981
4982/// Is there a floating point cmov for the specific X86 condition code?
4983 /// The current x86 ISA includes the following FP cmov instructions:
4984 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4985static bool hasFPCMov(unsigned X86CC) {
4986 switch (X86CC) {
4987 default:
4988 return false;
4989 case X86::COND_B:
4990 case X86::COND_BE:
4991 case X86::COND_E:
4992 case X86::COND_P:
4993 case X86::COND_A:
4994 case X86::COND_AE:
4995 case X86::COND_NE:
4996 case X86::COND_NP:
4997 return true;
4998 }
4999}
5000
5001
5002bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5003 const CallInst &I,
5004 MachineFunction &MF,
5005 unsigned Intrinsic) const {
5006
5007 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5008 if (!IntrData)
5009 return false;
5010
5011 Info.flags = MachineMemOperand::MONone;
5012 Info.offset = 0;
5013
5014 switch (IntrData->Type) {
5015 case TRUNCATE_TO_MEM_VI8:
5016 case TRUNCATE_TO_MEM_VI16:
5017 case TRUNCATE_TO_MEM_VI32: {
5018 Info.opc = ISD::INTRINSIC_VOID;
5019 Info.ptrVal = I.getArgOperand(0);
5020 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5021 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5022 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5023 ScalarVT = MVT::i8;
5024 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5025 ScalarVT = MVT::i16;
5026 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5027 ScalarVT = MVT::i32;
5028
5029 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5030 Info.align = Align(1);
5031 Info.flags |= MachineMemOperand::MOStore;
5032 break;
5033 }
5034 case GATHER:
5035 case GATHER_AVX2: {
5036 Info.opc = ISD::INTRINSIC_W_CHAIN;
5037 Info.ptrVal = nullptr;
5038 MVT DataVT = MVT::getVT(I.getType());
5039 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5040 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5041 IndexVT.getVectorNumElements());
5042 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5043 Info.align = Align(1);
5044 Info.flags |= MachineMemOperand::MOLoad;
5045 break;
5046 }
5047 case SCATTER: {
5048 Info.opc = ISD::INTRINSIC_VOID;
5049 Info.ptrVal = nullptr;
5050 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5051 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5052 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5053 IndexVT.getVectorNumElements());
5054 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5055 Info.align = Align(1);
5056 Info.flags |= MachineMemOperand::MOStore;
5057 break;
5058 }
5059 default:
5060 return false;
5061 }
5062
5063 return true;
5064}
5065
5066/// Returns true if the target can instruction select the
5067/// specified FP immediate natively. If false, the legalizer will
5068/// materialize the FP immediate as a load from a constant pool.
5069bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5070 bool ForCodeSize) const {
5071 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
5072 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
5073 return true;
5074 }
5075 return false;
5076}
5077
5078bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5079 ISD::LoadExtType ExtTy,
5080 EVT NewVT) const {
5081 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5082
5083 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5084 // relocation must target a movq or addq instruction: don't let the load shrink.
5085 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5086 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5087 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5088 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5089
5090 // If this is (1) an AVX vector load with (2) multiple uses and (3) each of
5091 // those uses is an extract that feeds directly into a store, then each
5092 // extract + store can be store-folded, so it's probably not worth splitting the load.
5093 EVT VT = Load->getValueType(0);
5094 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5095 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5096 // Skip uses of the chain value. Result 0 of the node is the load value.
5097 if (UI.getUse().getResNo() != 0)
5098 continue;
5099
5100 // If this use is not an extract + store, it's probably worth splitting.
5101 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5102 UI->use_begin()->getOpcode() != ISD::STORE)
5103 return true;
5104 }
5105 // All non-chain uses are extract + store.
5106 return false;
5107 }
5108
5109 return true;
5110}
5111
5112/// Returns true if it is beneficial to convert a load of a constant
5113/// to just the constant itself.
5114bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5115 Type *Ty) const {
5116 assert(Ty->isIntegerTy());
5117
5118 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5119 if (BitSize == 0 || BitSize > 64)
5120 return false;
5121 return true;
5122}
5123
5124bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5125 // If we are using XMM registers in the ABI and the condition of the select is
5126 // a floating-point compare and we have blendv or conditional move, then it is
5127 // cheaper to select instead of doing a cross-register move and creating a
5128 // load that depends on the compare result.
5129 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5130 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5131}
5132
5133bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5134 // TODO: It might be a win to ease or lift this restriction, but the generic
5135 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5136 if (VT.isVector() && Subtarget.hasAVX512())
5137 return false;
5138
5139 return true;
5140}
5141
5142bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5143 SDValue C) const {
5144 // TODO: We handle scalars using custom code, but generic combining could make
5145 // that unnecessary.
5146 APInt MulC;
5147 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5148 return false;
5149
5150 // Find the type this will be legalized to. Otherwise we might prematurely
5151 // convert this to shl+add/sub and then still have to type legalize those ops.
5152 // Another choice would be to defer the decision for illegal types until
5153 // after type legalization. But constant splat vectors of i64 can't make it
5154 // through type legalization on 32-bit targets so we would need to special
5155 // case vXi64.
5156 while (getTypeAction(Context, VT) != TypeLegal)
5157 VT = getTypeToTransformTo(Context, VT);
5158
5159 // If vector multiply is legal, assume that's faster than shl + add/sub.
5160 // TODO: Multiply is a complex op with higher latency and lower throughput in
5161 // most implementations, so this check could be loosened based on type
5162 // and/or a CPU attribute.
5163 if (isOperationLegal(ISD::MUL, VT))
5164 return false;
5165
5166 // shl+add, shl+sub, shl+add+neg
5167 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5168 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5169}
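// Worked examples of the final check, as a standalone sketch: a splat constant
// C is decomposed when C+1, C-1, 1-C or -(C+1) is a power of two.
constexpr bool sketchIsPow2(long long V) { return V > 0 && (V & (V - 1)) == 0; }
static_assert(sketchIsPow2(5 - 1), "x*5  -> (x << 2) + x");
static_assert(sketchIsPow2(7 + 1), "x*7  -> (x << 3) - x");
static_assert(sketchIsPow2(1 - (-3)), "x*-3 -> x - (x << 2)");
static_assert(sketchIsPow2(-(-9 + 1)), "x*-9 -> -((x << 3) + x)");
static_assert(!sketchIsPow2(11 + 1) && !sketchIsPow2(11 - 1) &&
              !sketchIsPow2(1 - 11) && !sketchIsPow2(-(11 + 1)),
              "x*11 is left to the hardware multiplier");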
5170
5171bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5172 unsigned Index) const {
5173 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5174 return false;
5175
5176 // Mask vectors support all subregister combinations and operations that
5177 // extract half of a vector.
5178 if (ResVT.getVectorElementType() == MVT::i1)
5179 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5180 (Index == ResVT.getVectorNumElements()));
5181
5182 return (Index % ResVT.getVectorNumElements()) == 0;
5183}
5184
5185bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5186 unsigned Opc = VecOp.getOpcode();
5187
5188 // Assume target opcodes can't be scalarized.
5189 // TODO - do we have any exceptions?
5190 if (Opc >= ISD::BUILTIN_OP_END)
5191 return false;
5192
5193 // If the vector op is not supported, try to convert to scalar.
5194 EVT VecVT = VecOp.getValueType();
5195 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5196 return true;
5197
5198 // If the vector op is supported, but the scalar op is not, the transform may
5199 // not be worthwhile.
5200 EVT ScalarVT = VecVT.getScalarType();
5201 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5202}
5203
5204bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5205 bool) const {
5206 // TODO: Allow vectors?
5207 if (VT.isVector())
5208 return false;
5209 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5210}
5211
5212bool X86TargetLowering::isCheapToSpeculateCttz() const {
5213 // Speculate cttz only if we can directly use TZCNT.
5214 return Subtarget.hasBMI();
5215}
5216
5217bool X86TargetLowering::isCheapToSpeculateCtlz() const {
5218 // Speculate ctlz only if we can directly use LZCNT.
5219 return Subtarget.hasLZCNT();
5220}
5221
5222bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5223 const SelectionDAG &DAG,
5224 const MachineMemOperand &MMO) const {
5225 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5226 BitcastVT.getVectorElementType() == MVT::i1)
5227 return false;
5228
5229 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5230 return false;
5231
5232 // If both types are legal vectors, it's always ok to convert them.
5233 if (LoadVT.isVector() && BitcastVT.isVector() &&
5234 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5235 return true;
5236
5237 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5238}
5239
5240bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5241 const SelectionDAG &DAG) const {
5242 // Do not merge stores up to the float/vector value size (128 bits) if the
5243 // NoImplicitFloat attribute is set.
5244 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
5245 Attribute::NoImplicitFloat);
5246
5247 if (NoFloat) {
5248 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
5249 return (MemVT.getSizeInBits() <= MaxIntSize);
5250 }
5251 // Make sure we don't merge greater than our preferred vector
5252 // width.
5253 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
5254 return false;
5255 return true;
5256}
5257
5258bool X86TargetLowering::isCtlzFast() const {
5259 return Subtarget.hasFastLZCNT();
5260}
5261
5262bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
5263 const Instruction &AndI) const {
5264 return true;
5265}
5266
5267bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
5268 EVT VT = Y.getValueType();
5269
5270 if (VT.isVector())
5271 return false;
5272
5273 if (!Subtarget.hasBMI())
5274 return false;
5275
5276 // There are only 32-bit and 64-bit forms for 'andn'.
5277 if (VT != MVT::i32 && VT != MVT::i64)
5278 return false;
5279
5280 return !isa<ConstantSDNode>(Y);
5281}
5282
5283bool X86TargetLowering::hasAndNot(SDValue Y) const {
5284 EVT VT = Y.getValueType();
5285
5286 if (!VT.isVector())
5287 return hasAndNotCompare(Y);
5288
5289 // Vector.
5290
5291 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
5292 return false;
5293
5294 if (VT == MVT::v4i32)
5295 return true;
5296
5297 return Subtarget.hasSSE2();
5298}
5299
5300bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
5301 return X.getValueType().isScalarInteger(); // 'bt'
5302}
5303
5304bool X86TargetLowering::
5305 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5306 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
5307 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
5308 SelectionDAG &DAG) const {
5309 // Does the baseline implementation recommend not performing the fold by default?
5310 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
5311 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
5312 return false;
5313 // For scalars this transform is always beneficial.
5314 if (X.getValueType().isScalarInteger())
5315 return true;
5316 // If all the shift amounts are identical, then the transform is beneficial even
5317 // with rudimentary SSE2 shifts.
5318 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
5319 return true;
5320 // If we have AVX2 with its powerful shift operations, then it's also good.
5321 if (Subtarget.hasAVX2())
5322 return true;
5323 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
5324 return NewShiftOpcode == ISD::SHL;
5325}
5326
5327bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
5328 const SDNode *N, CombineLevel Level) const {
5329 assert(((N->getOpcode() == ISD::SHL &&
5330 N->getOperand(0).getOpcode() == ISD::SRL) ||
5331 (N->getOpcode() == ISD::SRL &&
5332 N->getOperand(0).getOpcode() == ISD::SHL)) &&
5333 "Expected shift-shift mask");
5334 EVT VT = N->getValueType(0);
5335 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
5336 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
5337 // Only fold if the shift values are equal - so it folds to AND.
5338 // TODO - we should fold if either is a non-uniform vector but we don't do
5339 // the fold for non-splats yet.
5340 return N->getOperand(1) == N->getOperand(0).getOperand(1);
5341 }
5342 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
5343}
5344
5345bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
5346 EVT VT = Y.getValueType();
5347
5348 // For vectors, we don't have a preference, but we probably want a mask.
5349 if (VT.isVector())
5350 return false;
5351
5352 // 64-bit shifts on 32-bit targets produce really bad bloated code.
5353 if (VT == MVT::i64 && !Subtarget.is64Bit())
5354 return false;
5355
5356 return true;
5357}
5358
5359bool X86TargetLowering::shouldExpandShift(SelectionDAG &DAG,
5360 SDNode *N) const {
5361 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
5362 !Subtarget.isOSWindows())
5363 return false;
5364 return true;
5365}
5366
5367bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
5368 // Any legal vector type can be splatted more efficiently than
5369 // loading/spilling from memory.
5370 return isTypeLegal(VT);
5371}
5372
5373MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
5374 MVT VT = MVT::getIntegerVT(NumBits);
5375 if (isTypeLegal(VT))
5376 return VT;
5377
5378 // PMOVMSKB can handle this.
5379 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
5380 return MVT::v16i8;
5381
5382 // VPMOVMSKB can handle this.
5383 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
5384 return MVT::v32i8;
5385
5386 // TODO: Allow 64-bit type for 32-bit target.
5387 // TODO: 512-bit types should be allowed, but make sure that those
5388 // cases are handled in combineVectorSizedSetCCEquality().
5389
5390 return MVT::INVALID_SIMPLE_VALUE_TYPE;
5391}
5392
5393/// Val is the undef sentinel value or equal to the specified value.
5394static bool isUndefOrEqual(int Val, int CmpVal) {
5395 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
5396}
5397
5398/// Val is either the undef or zero sentinel value.
5399static bool isUndefOrZero(int Val) {
5400 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
5401}
5402
5403/// Return true if every element in Mask, beginning from position Pos and ending
5404/// in Pos+Size is the undef sentinel value.
5405static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
5406 return llvm::all_of(Mask.slice(Pos, Size),
5407 [](int M) { return M == SM_SentinelUndef; });
5408}
5409
5410/// Return true if the mask creates a vector whose lower half is undefined.
5411static bool isUndefLowerHalf(ArrayRef<int> Mask) {
5412 unsigned NumElts = Mask.size();
5413 return isUndefInRange(Mask, 0, NumElts / 2);
5414}
5415
5416/// Return true if the mask creates a vector whose upper half is undefined.
5417static bool isUndefUpperHalf(ArrayRef<int> Mask) {
5418 unsigned NumElts = Mask.size();
5419 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
5420}
5421
5422 /// Return true if Val falls within the specified range [Low, Hi).
5423static bool isInRange(int Val, int Low, int Hi) {
5424 return (Val >= Low && Val < Hi);
5425}
5426
5427/// Return true if the value of any element in Mask falls within the specified
5428 /// range [Low, Hi).
5429static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
5430 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
5431}
5432
5433/// Return true if Val is undef or if its value falls within the
5434 /// specified range [Low, Hi).
5435static bool isUndefOrInRange(int Val, int Low, int Hi) {
5436 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
5437}
5438
5439/// Return true if every element in Mask is undef or if its value
5440 /// falls within the specified range [Low, Hi).
5441static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5442 return llvm::all_of(
5443 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
5444}
5445
5446/// Return true if Val is undef, zero or if its value falls within the
5447 /// specified range [Low, Hi).
5448static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
5449 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
5450}
5451
5452/// Return true if every element in Mask is undef, zero or if its value
5453 /// falls within the specified range [Low, Hi).
5454static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
5455 return llvm::all_of(
5456 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
5457}
5458
5459/// Return true if every element in Mask, beginning
5460/// from position Pos and ending in Pos + Size, falls within the specified
5461/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
5462static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
5463 unsigned Size, int Low, int Step = 1) {
5464 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5465 if (!isUndefOrEqual(Mask[i], Low))
5466 return false;
5467 return true;
5468}
5469
5470/// Return true if every element in Mask, beginning
5471/// from position Pos and ending in Pos+Size, falls within the specified
5472 /// sequential range [Low, Low+Size), or is undef or is zero.
5473static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5474 unsigned Size, int Low,
5475 int Step = 1) {
5476 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
5477 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
5478 return false;
5479 return true;
5480}
5481
5482/// Return true if every element in Mask, beginning
5483/// from position Pos and ending in Pos+Size is undef or is zero.
5484static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
5485 unsigned Size) {
5486 return llvm::all_of(Mask.slice(Pos, Size),
5487 [](int M) { return isUndefOrZero(M); });
5488}
5489
5490/// Helper function to test whether a shuffle mask could be
5491/// simplified by widening the elements being shuffled.
5492///
5493/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
5494/// leaves it in an unspecified state.
5495///
5496/// NOTE: This must handle normal vector shuffle masks and *target* vector
5497/// shuffle masks. The latter have the special property of a '-2' representing
5498/// a zero-ed lane of a vector.
5499static bool canWidenShuffleElements(ArrayRef<int> Mask,
5500 SmallVectorImpl<int> &WidenedMask) {
5501 WidenedMask.assign(Mask.size() / 2, 0);
5502 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
5503 int M0 = Mask[i];
5504 int M1 = Mask[i + 1];
5505
5506 // If both elements are undef, it's trivial.
5507 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
5508 WidenedMask[i / 2] = SM_SentinelUndef;
5509 continue;
5510 }
5511
5512 // Check for an undef mask and a mask value properly aligned to fit with
5513 // a pair of values. If we find such a case, use the non-undef mask's value.
5514 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
5515 WidenedMask[i / 2] = M1 / 2;
5516 continue;
5517 }
5518 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
5519 WidenedMask[i / 2] = M0 / 2;
5520 continue;
5521 }
5522
5523 // When zeroing, we need to spread the zeroing across both lanes to widen.
5524 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
5525 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
5526 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
5527 WidenedMask[i / 2] = SM_SentinelZero;
5528 continue;
5529 }
5530 return false;
5531 }
5532
5533 // Finally check if the two mask values are adjacent and aligned with
5534 // a pair.
5535 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
5536 WidenedMask[i / 2] = M0 / 2;
5537 continue;
5538 }
5539
5540 // Otherwise we can't safely widen the elements used in this shuffle.
5541 return false;
5542 }
5543 assert(WidenedMask.size() == Mask.size() / 2 &&
5544 "Incorrect size of mask after widening the elements!");
5545
5546 return true;
5547}
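// Minimal standalone sketch of the per-pair rule above, using the sentinel
// values of this file (SM_SentinelUndef == -1, SM_SentinelZero == -2); it only
// illustrates the arithmetic and is not the helper itself.
constexpr int sketchWidenPair(int M0, int M1) {
  return (M0 == -1 && M1 == -1)                     ? -1      // both undef
       : (M0 == -1 && M1 >= 0 && (M1 % 2) == 1)     ? M1 / 2  // undef + odd half
       : (M1 == -1 && M0 >= 0 && (M0 % 2) == 0)     ? M0 / 2  // even half + undef
       : (M0 < 0 && M1 < 0)                         ? -2      // widened zero
       : (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) ? M0 / 2  // adjacent aligned pair
                                                    : -3;     // cannot widen
}
static_assert(sketchWidenPair(2, 3) == 1, "elements 2,3 widen to wide element 1");
static_assert(sketchWidenPair(-1, 5) == 2, "undef pairs with the odd half of wide element 2");
static_assert(sketchWidenPair(-2, -1) == -2, "zero + undef widens to a zeroed wide element");
static_assert(sketchWidenPair(1, 2) == -3, "a misaligned pair cannot be widened");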
5548
5549static bool canWidenShuffleElements(ArrayRef<int> Mask,
5550 const APInt &Zeroable,
5551 bool V2IsZero,
5552 SmallVectorImpl<int> &WidenedMask) {
5553 // Create an alternative mask with info about zeroable elements.
5554 // Here we do not set undef elements as zeroable.
5555 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
5556 if (V2IsZero) {
5557 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
5558 for (int i = 0, Size = Mask.size(); i != Size; ++i)
5559 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
5560 ZeroableMask[i] = SM_SentinelZero;
5561 }
5562 return canWidenShuffleElements(ZeroableMask, WidenedMask);
5563}
5564
5565static bool canWidenShuffleElements(ArrayRef<int> Mask) {
5566 SmallVector<int, 32> WidenedMask;
5567 return canWidenShuffleElements(Mask, WidenedMask);
5568}
5569
5570/// Returns true if Elt is a constant zero or a floating point constant +0.0.
5571bool X86::isZeroNode(SDValue Elt) {
5572 return isNullConstant(Elt) || isNullFPConstant(Elt);
5573}
5574
5575// Build a vector of constants.
5576// Use an UNDEF node if MaskElt == -1.
5577 // Split 64-bit constants in 32-bit mode.
5578static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5579 const SDLoc &dl, bool IsMask = false) {
5580
5581 SmallVector<SDValue, 32> Ops;
5582 bool Split = false;
5583
5584 MVT ConstVecVT = VT;
5585 unsigned NumElts = VT.getVectorNumElements();
5586 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5587 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5588 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5589 Split = true;
5590 }
5591
5592 MVT EltVT = ConstVecVT.getVectorElementType();
5593 for (unsigned i = 0; i < NumElts; ++i) {
5594 bool IsUndef = Values[i] < 0 && IsMask;
5595 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5596 DAG.getConstant(Values[i], dl, EltVT);
5597 Ops.push_back(OpNode);
5598 if (Split)
5599 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5600 DAG.getConstant(0, dl, EltVT));
5601 }
5602 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5603 if (Split)
5604 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5605 return ConstsNode;
5606}
5607
5608static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5609 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5610 assert(Bits.size() == Undefs.getBitWidth() &&
5611 "Unequal constant and undef arrays");
5612 SmallVector<SDValue, 32> Ops;
5613 bool Split = false;
5614
5615 MVT ConstVecVT = VT;
5616 unsigned NumElts = VT.getVectorNumElements();
5617 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5618 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5619 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5620 Split = true;
5621 }
5622
5623 MVT EltVT = ConstVecVT.getVectorElementType();
5624 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5625 if (Undefs[i]) {
5626 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5627 continue;
5628 }
5629 const APInt &V = Bits[i];
5630 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5631 if (Split) {
5632 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5633 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5634 } else if (EltVT == MVT::f32) {
5635 APFloat FV(APFloat::IEEEsingle(), V);
5636 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5637 } else if (EltVT == MVT::f64) {
5638 APFloat FV(APFloat::IEEEdouble(), V);
5639 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5640 } else {
5641 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5642 }
5643 }
5644
5645 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5646 return DAG.getBitcast(VT, ConstsNode);
5647}
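// On a 32-bit target (no legal i64) a vXi64 constant is built as a 2*X-element
// i32 vector and bitcast back; each 64-bit value contributes its low 32 bits
// first and then its high 32 bits, matching V.trunc(32) / V.lshr(32).trunc(32)
// above. Standalone sketch of that split:
constexpr unsigned long long sketchLo32(unsigned long long V) { return V & 0xffffffffULL; }
constexpr unsigned long long sketchHi32(unsigned long long V) { return V >> 32; }
static_assert(sketchLo32(0x0000000300000001ULL) == 0x1 &&
              sketchHi32(0x0000000300000001ULL) == 0x3,
              "0x0000000300000001 becomes the i32 element pair {0x1, 0x3}");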
5648
5649/// Returns a vector of specified type with all zero elements.
5650static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5651 SelectionDAG &DAG, const SDLoc &dl) {
5652 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5653 VT.getVectorElementType() == MVT::i1) &&
5654 "Unexpected vector type");
5655
5656 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5657 // type. This ensures they get CSE'd. But if the integer type is not
5658 // available, use a floating-point +0.0 instead.
5659 SDValue Vec;
5660 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5661 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5662 } else if (VT.isFloatingPoint()) {
5663 Vec = DAG.getConstantFP(+0.0, dl, VT);
5664 } else if (VT.getVectorElementType() == MVT::i1) {
5665 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5666 "Unexpected vector type");
5667 Vec = DAG.getConstant(0, dl, VT);
5668 } else {
5669 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5670 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5671 }
5672 return DAG.getBitcast(VT, Vec);
5673}
5674
5675static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5676 const SDLoc &dl, unsigned vectorWidth) {
5677 EVT VT = Vec.getValueType();
5678 EVT ElVT = VT.getVectorElementType();
5679 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5680 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5681 VT.getVectorNumElements()/Factor);
5682
5683 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5684 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5685 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5686
5687 // This is the index of the first element of the vectorWidth-bit chunk
5688 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5689 IdxVal &= ~(ElemsPerChunk - 1);
5690
5691 // If the input is a buildvector just emit a smaller one.
5692 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5693 return DAG.getBuildVector(ResultVT, dl,
5694 Vec->ops().slice(IdxVal, ElemsPerChunk));
5695
5696 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5697 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5698}
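// The masking above relies on ElemsPerChunk being a power of two: clearing the
// low bits rounds IdxVal down to the containing chunk boundary. A couple of
// concrete values, as a standalone sketch:
static_assert((6u & ~(4u - 1)) == 4u, "element index 6 in 4-element chunks starts at 4");
static_assert((13u & ~(8u - 1)) == 8u, "element index 13 in 8-element chunks starts at 8");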
5699
5700/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5701/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5702/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5703/// instructions or a simple subregister reference. Idx is an index in the
5704/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5705/// lowering EXTRACT_VECTOR_ELT operations easier.
5706static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5707 SelectionDAG &DAG, const SDLoc &dl) {
5708 assert((Vec.getValueType().is256BitVector() ||
5709 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5710 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5711}
5712
5713/// Generate a DAG to grab 256-bits from a 512-bit vector.
5714static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5715 SelectionDAG &DAG, const SDLoc &dl) {
5716 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5717 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5718}
5719
5720static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5721 SelectionDAG &DAG, const SDLoc &dl,
5722 unsigned vectorWidth) {
5723 assert((vectorWidth == 128 || vectorWidth == 256) &&
5724 "Unsupported vector width");
5725 // Inserting UNDEF just returns Result.
5726 if (Vec.isUndef())
5727 return Result;
5728 EVT VT = Vec.getValueType();
5729 EVT ElVT = VT.getVectorElementType();
5730 EVT ResultVT = Result.getValueType();
5731
5732 // Insert the relevant vectorWidth bits.
5733 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5734 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5735
5736 // This is the index of the first element of the vectorWidth-bit chunk
5737 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5738 IdxVal &= ~(ElemsPerChunk - 1);
5739
5740 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5741 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5742}
5743
5744/// Generate a DAG to put 128-bits into a vector > 128 bits. This
5745/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5746/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5747/// simple superregister reference. Idx is an index in the 128 bits
5748/// we want. It need not be aligned to a 128-bit boundary. That makes
5749/// lowering INSERT_VECTOR_ELT operations easier.
5750static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5751 SelectionDAG &DAG, const SDLoc &dl) {
5752 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5753 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5754}
5755
5756/// Widen a vector to a larger size with the same scalar type, with the new
5757/// elements either zero or undef.
5758static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5759 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5760 const SDLoc &dl) {
5761 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5762 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5763 "Unsupported vector widening type");
5764 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5765 : DAG.getUNDEF(VT);
5766 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5767 DAG.getIntPtrConstant(0, dl));
5768}
5769
5770/// Widen a vector to a larger size with the same scalar type, with the new
5771/// elements either zero or undef.
5772static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
5773 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5774 const SDLoc &dl, unsigned WideSizeInBits) {
5775 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
5776 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
5777 "Unsupported vector widening type");
5778 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
5779 MVT SVT = Vec.getSimpleValueType().getScalarType();
5780 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
5781 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
5782}
5783
5784// Helper function to collect subvector ops that are concatenated together,
5785 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
5786// The subvectors in Ops are guaranteed to be the same type.
5787static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
5788 assert(Ops.empty() && "Expected an empty ops vector");
5789
5790 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
5791 Ops.append(N->op_begin(), N->op_end());
5792 return true;
5793 }
5794
5795 if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
5796 isa<ConstantSDNode>(N->getOperand(2))) {
5797 SDValue Src = N->getOperand(0);
5798 SDValue Sub = N->getOperand(1);
5799 const APInt &Idx = N->getConstantOperandAPInt(2);
5800 EVT VT = Src.getValueType();
5801 EVT SubVT = Sub.getValueType();
5802
5803 // TODO - Handle more general insert_subvector chains.
5804 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
5805 Idx == (VT.getVectorNumElements() / 2) &&
5806 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5807 Src.getOperand(1).getValueType() == SubVT &&
5808 isNullConstant(Src.getOperand(2))) {
5809 Ops.push_back(Src.getOperand(1));
5810 Ops.push_back(Sub);
5811 return true;
5812 }
5813 }
5814
5815 return false;
5816}
5817
5818 // Helper for splitting the operands of an operation into legal target-sized
5819 // pieces and applying a function to each piece.
5820// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5821// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5822// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5823// The argument Builder is a function that will be applied on each split part:
5824// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
5825template <typename F>
5826SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5827 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5828 F Builder, bool CheckBWI = true) {
5829 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5830 unsigned NumSubs = 1;
5831 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5832 (!CheckBWI && Subtarget.useAVX512Regs())) {
5833 if (VT.getSizeInBits() > 512) {
5834 NumSubs = VT.getSizeInBits() / 512;
5835 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5836 }
5837 } else if (Subtarget.hasAVX2()) {
5838 if (VT.getSizeInBits() > 256) {
5839 NumSubs = VT.getSizeInBits() / 256;
5840 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5841 }
5842 } else {
5843 if (VT.getSizeInBits() > 128) {
5844 NumSubs = VT.getSizeInBits() / 128;
5845 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5846 }
5847 }
5848
5849 if (NumSubs == 1)
5850 return Builder(DAG, DL, Ops);
5851
5852 SmallVector<SDValue, 4> Subs;
5853 for (unsigned i = 0; i != NumSubs; ++i) {
5854 SmallVector<SDValue, 2> SubOps;
5855 for (SDValue Op : Ops) {
5856 EVT OpVT = Op.getValueType();
5857 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5858 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5859 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5860 }
5861 Subs.push_back(Builder(DAG, DL, SubOps));
5862 }
5863 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
5864}
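// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// A minimal model of how SplitOpsAndApply picks NumSubs above, using plain
// integers and hypothetical booleans in place of the Subtarget feature
// queries (useBWIRegs()/useAVX512Regs() are collapsed into one flag here).
#include <cassert>
static unsigned pickNumSubs(unsigned VTSizeInBits, bool Has512BitRegs,
                            bool HasAVX2) {
  unsigned LegalWidth = Has512BitRegs ? 512u : HasAVX2 ? 256u : 128u;
  if (VTSizeInBits <= LegalWidth)
    return 1;                              // Builder is called once, unsplit.
  assert(VTSizeInBits % LegalWidth == 0 && "Illegal vector size");
  return VTSizeInBits / LegalWidth;        // one Builder call per sub-vector.
}
// e.g. a 512-bit op on an AVX2-only target: pickNumSubs(512, false, true) == 2.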
5865
5866 /// Insert an i1 subvector into an i1 vector.
5867static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5868 const X86Subtarget &Subtarget) {
5869
5870 SDLoc dl(Op);
5871 SDValue Vec = Op.getOperand(0);
5872 SDValue SubVec = Op.getOperand(1);
5873 SDValue Idx = Op.getOperand(2);
5874
5875 if (!isa<ConstantSDNode>(Idx))
5876 return SDValue();
5877
5878 // Inserting undef is a nop. We can just return the original vector.
5879 if (SubVec.isUndef())
5880 return Vec;
5881
5882 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5883 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5884 return Op;
5885
5886 MVT OpVT = Op.getSimpleValueType();
5887 unsigned NumElems = OpVT.getVectorNumElements();
5888
5889 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5890
5891 // Extend to natively supported kshift.
5892 MVT WideOpVT = OpVT;
5893 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5894 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5895
5896 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5897 // if necessary.
5898 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5899 // May need to promote to a legal type.
5900 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5901 DAG.getConstant(0, dl, WideOpVT),
5902 SubVec, Idx);
5903 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5904 }
5905
5906 MVT SubVecVT = SubVec.getSimpleValueType();
5907 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5908
5909 assert(IdxVal + SubVecNumElems <= NumElems &&
5910 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5911 "Unexpected index value in INSERT_SUBVECTOR");
5912
5913 SDValue Undef = DAG.getUNDEF(WideOpVT);
5914
5915 if (IdxVal == 0) {
5916 // Zero lower bits of the Vec
5917 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
5918 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5919 ZeroIdx);
5920 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5921 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5922 // Merge them together, SubVec should be zero extended.
5923 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5924 DAG.getConstant(0, dl, WideOpVT),
5925 SubVec, ZeroIdx);
5926 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5928 }
5929
5930 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5931 Undef, SubVec, ZeroIdx);
5932
5933 if (Vec.isUndef()) {
5934 assert(IdxVal != 0 && "Unexpected index");
5935 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5936 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
5937 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5938 }
5939
5940 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5941 assert(IdxVal != 0 && "Unexpected index");
5942 NumElems = WideOpVT.getVectorNumElements();
5943 unsigned ShiftLeft = NumElems - SubVecNumElems;
5944 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5945 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5946 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5947 if (ShiftRight != 0)
5948 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5949 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5951 }
5952
5953 // Simple case when we put the subvector in the upper part.
5954 if (IdxVal + SubVecNumElems == NumElems) {
5955 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5956 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
5957 if (SubVecNumElems * 2 == NumElems) {
5958 // Special case, use legal zero extending insert_subvector. This allows
5959 // isel to optimize when bits are known zero.
5960 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5961 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5962 DAG.getConstant(0, dl, WideOpVT),
5963 Vec, ZeroIdx);
5964 } else {
5965 // Otherwise use explicit shifts to zero the bits.
5966 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5967 Undef, Vec, ZeroIdx);
5968 NumElems = WideOpVT.getVectorNumElements();
5969 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
5970 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5971 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5972 }
5973 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5974 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5975 }
5976
5977 // Inserting into the middle is more complicated.
5978
5979 NumElems = WideOpVT.getVectorNumElements();
5980
5981 // Widen the vector if needed.
5982 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5983
5984 unsigned ShiftLeft = NumElems - SubVecNumElems;
5985 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5986
5987 // Do an optimization for the most frequently used types.
5988 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
5989 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
5990 Mask0.flipAllBits();
5991 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
5992 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
5993 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
5994 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5995 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
5996 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5997 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
5998 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5999
6000 // Reduce to original width if needed.
6001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6002 }
6003
6004 // Clear the upper bits of the subvector and move it to its insert position.
6005 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6006 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6007 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6008 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6009
6010 // Isolate the bits below the insertion point.
6011 unsigned LowShift = NumElems - IdxVal;
6012 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
6013 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6014 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
6015 DAG.getTargetConstant(LowShift, dl, MVT::i8));
6016
6017 // Isolate the bits after the last inserted bit.
6018 unsigned HighShift = IdxVal + SubVecNumElems;
6019 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
6020 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6021 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
6022 DAG.getTargetConstant(HighShift, dl, MVT::i8));
6023
6024 // Now OR all 3 pieces together.
6025 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
6026 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
6027
6028 // Reduce to original width if needed.
6029 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6030}
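// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// The "insert into the middle" path above, modeled on a plain 16-bit mask
// where bit i plays the role of element i of a v16i1 k-register. All names
// are local to this sketch; KSHIFTL/KSHIFTR become ordinary shifts.
#include <cassert>
#include <cstdint>
static uint16_t insertMaskMiddle(uint16_t Vec, uint16_t Sub, unsigned IdxVal,
                                 unsigned SubNumElts) {
  const unsigned NumElems = 16;
  // The middle path only runs when the subvector lies strictly inside Vec.
  assert(IdxVal != 0 && IdxVal + SubNumElts < NumElems && "not a middle insert");
  unsigned ShiftLeft = NumElems - SubNumElts;
  unsigned ShiftRight = NumElems - SubNumElts - IdxVal;
  // Clear everything above the subvector, then slide it down to IdxVal.
  uint16_t Placed = (uint16_t)((uint16_t)(Sub << ShiftLeft) >> ShiftRight);
  // Keep only the elements below the insertion point...
  unsigned LowShift = NumElems - IdxVal;
  uint16_t Low = (uint16_t)((uint16_t)(Vec << LowShift) >> LowShift);
  // ...and the elements at or above IdxVal + SubNumElts.
  unsigned HighShift = IdxVal + SubNumElts;
  uint16_t High = (uint16_t)((uint16_t)(Vec >> HighShift) << HighShift);
  return (uint16_t)(Low | High | Placed);  // OR the three pieces together.
}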
6031
6032static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
6033 const SDLoc &dl) {
6034 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
6035 EVT SubVT = V1.getValueType();
6036 EVT SubSVT = SubVT.getScalarType();
6037 unsigned SubNumElts = SubVT.getVectorNumElements();
6038 unsigned SubVectorWidth = SubVT.getSizeInBits();
6039 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
6040 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
6041 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
6042}
6043
6044/// Returns a vector of specified type with all bits set.
6045/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
6046/// Then bitcast to their original type, ensuring they get CSE'd.
6047static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6048 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6049 "Expected a 128/256/512-bit vector type");
6050
6051 APInt Ones = APInt::getAllOnesValue(32);
6052 unsigned NumElts = VT.getSizeInBits() / 32;
6053 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
6054 return DAG.getBitcast(VT, Vec);
6055}
6056
6057// Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
6058static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode) {
6059 switch (Opcode) {
6060 case ISD::ANY_EXTEND:
6061 case ISD::ANY_EXTEND_VECTOR_INREG:
6062 return ISD::ANY_EXTEND_VECTOR_INREG;
6063 case ISD::ZERO_EXTEND:
6064 case ISD::ZERO_EXTEND_VECTOR_INREG:
6065 return ISD::ZERO_EXTEND_VECTOR_INREG;
6066 case ISD::SIGN_EXTEND:
6067 case ISD::SIGN_EXTEND_VECTOR_INREG:
6068 return ISD::SIGN_EXTEND_VECTOR_INREG;
6069 }
6070 llvm_unreachable("Unknown opcode");
6071}
6072
6073static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
6074 SDValue In, SelectionDAG &DAG) {
6075 EVT InVT = In.getValueType();
6076 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
6077 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
6078 ISD::ZERO_EXTEND == Opcode) &&
6079 "Unknown extension opcode");
6080
6081 // For 256-bit vectors, we only need the lower (128-bit) input half.
6082 // For 512-bit vectors, we only need the lower input half or quarter.
6083 if (InVT.getSizeInBits() > 128) {
6084 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
6085 "Expected VTs to be the same size!");
6086 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
6087 In = extractSubVector(In, 0, DAG, DL,
6088 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
6089 InVT = In.getValueType();
6090 }
6091
6092 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
6093 Opcode = getOpcode_EXTEND_VECTOR_INREG(Opcode);
6094
6095 return DAG.getNode(Opcode, DL, VT, In);
6096}
6097
6098// Match (xor X, -1) -> X.
6099// Match extract_subvector(xor X, -1) -> extract_subvector(X).
6100// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
6101static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
6102 V = peekThroughBitcasts(V);
6103 if (V.getOpcode() == ISD::XOR &&
6104 ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
6105 return V.getOperand(0);
6106 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6107 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
6108 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
6109 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
6110 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
6111 Not, V.getOperand(1));
6112 }
6113 }
6114 SmallVector<SDValue, 2> CatOps;
6115 if (collectConcatOps(V.getNode(), CatOps)) {
6116 for (SDValue &CatOp : CatOps) {
6117 SDValue NotCat = IsNOT(CatOp, DAG);
6118 if (!NotCat) return SDValue();
6119 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
6120 }
6121 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
6122 }
6123 return SDValue();
6124}
6125
6126/// Returns a vector_shuffle node for an unpackl operation.
6127static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
6128 SDValue V1, SDValue V2) {
6129 SmallVector<int, 8> Mask;
6130 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
6131 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6132}
6133
6134/// Returns a vector_shuffle node for an unpackh operation.
6135static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
6136 SDValue V1, SDValue V2) {
6137 SmallVector<int, 8> Mask;
6138 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
6139 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
6140}
6141
6142 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
6143/// This produces a shuffle where the low element of V2 is swizzled into the
6144/// zero/undef vector, landing at element Idx.
6145/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
6146static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
6147 bool IsZero,
6148 const X86Subtarget &Subtarget,
6149 SelectionDAG &DAG) {
6150 MVT VT = V2.getSimpleValueType();
6151 SDValue V1 = IsZero
6152 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
6153 int NumElems = VT.getVectorNumElements();
6154 SmallVector<int, 16> MaskVec(NumElems);
6155 for (int i = 0; i != NumElems; ++i)
6156 // If this is the insertion idx, put the low elt of V2 here.
6157 MaskVec[i] = (i == Idx) ? NumElems : i;
6158 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
6159}
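// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// The mask built above, spelled out for a 4-element vector with plain
// std::vector: index NumElems (4) selects the low element of V2, while
// smaller indices keep the corresponding zero/undef element of V1.
#include <vector>
static std::vector<int> zeroOrUndefMask(int NumElems, int Idx) {
  std::vector<int> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return MaskVec; // Idx=0 -> {4,1,2,3}; Idx=3 -> {0,1,2,4}.
}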
6160
6161static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
6162 if (!Load || !ISD::isNormalLoad(Load))
6163 return nullptr;
6164
6165 SDValue Ptr = Load->getBasePtr();
6166 if (Ptr->getOpcode() == X86ISD::Wrapper ||
6167 Ptr->getOpcode() == X86ISD::WrapperRIP)
6168 Ptr = Ptr->getOperand(0);
6169
6170 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6171 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
6172 return nullptr;
6173
6174 return CNode->getConstVal();
6175}
6176
6177static const Constant *getTargetConstantFromNode(SDValue Op) {
6178 Op = peekThroughBitcasts(Op);
6179 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
6180}
6181
6182const Constant *
6183X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
6184 assert(LD && "Unexpected null LoadSDNode");
6185 return getTargetConstantFromNode(LD);
6186}
6187
6188// Extract raw constant bits from constant pools.
6189static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
6190 APInt &UndefElts,
6191 SmallVectorImpl<APInt> &EltBits,
6192 bool AllowWholeUndefs = true,
6193 bool AllowPartialUndefs = true) {
6194 assert(EltBits.empty() && "Expected an empty EltBits vector");
6195
6196 Op = peekThroughBitcasts(Op);
6197
6198 EVT VT = Op.getValueType();
6199 unsigned SizeInBits = VT.getSizeInBits();
6200 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
6201 unsigned NumElts = SizeInBits / EltSizeInBits;
6202
6203 // Bitcast a source array of element bits to the target size.
6204 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
6205 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
6206 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
6207 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
6208 "Constant bit sizes don't match");
6209
6210 // Don't split if we don't allow undef bits.
6211 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
6212 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
6213 return false;
6214
6215 // If we're already the right size, don't bother bitcasting.
6216 if (NumSrcElts == NumElts) {
6217 UndefElts = UndefSrcElts;
6218 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
6219 return true;
6220 }
6221
6222 // Extract all the undef/constant element data and pack into single bitsets.
6223 APInt UndefBits(SizeInBits, 0);
6224 APInt MaskBits(SizeInBits, 0);
6225
6226 for (unsigned i = 0; i != NumSrcElts; ++i) {
6227 unsigned BitOffset = i * SrcEltSizeInBits;
6228 if (UndefSrcElts[i])
6229 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
6230 MaskBits.insertBits(SrcEltBits[i], BitOffset);
6231 }
6232
6233 // Split the undef/constant single bitset data into the target elements.
6234 UndefElts = APInt(NumElts, 0);
6235 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
6236
6237 for (unsigned i = 0; i != NumElts; ++i) {
6238 unsigned BitOffset = i * EltSizeInBits;
6239 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
6240
6241 // Only treat an element as UNDEF if all bits are UNDEF.
6242 if (UndefEltBits.isAllOnesValue()) {
6243 if (!AllowWholeUndefs)
6244 return false;
6245 UndefElts.setBit(i);
6246 continue;
6247 }
6248
6249 // If only some bits are UNDEF then treat them as zero (or bail if not
6250 // supported).
6251 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
6252 return false;
6253
6254 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
6255 }
6256 return true;
6257 };
6258
6259 // Collect constant bits and insert into mask/undef bit masks.
6260 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
6261 unsigned UndefBitIndex) {
6262 if (!Cst)
6263 return false;
6264 if (isa<UndefValue>(Cst)) {
6265 Undefs.setBit(UndefBitIndex);
6266 return true;
6267 }
6268 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
6269 Mask = CInt->getValue();
6270 return true;
6271 }
6272 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
6273 Mask = CFP->getValueAPF().bitcastToAPInt();
6274 return true;
6275 }
6276 return false;
6277 };
6278
6279 // Handle UNDEFs.
6280 if (Op.isUndef()) {
6281 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
6282 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
6283 return CastBitData(UndefSrcElts, SrcEltBits);
6284 }
6285
6286 // Extract scalar constant bits.
6287 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
6288 APInt UndefSrcElts = APInt::getNullValue(1);
6289 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
6290 return CastBitData(UndefSrcElts, SrcEltBits);
6291 }
6292 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
6293 APInt UndefSrcElts = APInt::getNullValue(1);
6294 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6295 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
6296 return CastBitData(UndefSrcElts, SrcEltBits);
6297 }
6298
6299 // Extract constant bits from build vector.
6300 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6301 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6302 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6303
6304 APInt UndefSrcElts(NumSrcElts, 0);
6305 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6306 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6307 const SDValue &Src = Op.getOperand(i);
6308 if (Src.isUndef()) {
6309 UndefSrcElts.setBit(i);
6310 continue;
6311 }
6312 auto *Cst = cast<ConstantSDNode>(Src);
6313 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
6314 }
6315 return CastBitData(UndefSrcElts, SrcEltBits);
6316 }
6317 if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) {
6318 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6319 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6320
6321 APInt UndefSrcElts(NumSrcElts, 0);
6322 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6323 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
6324 const SDValue &Src = Op.getOperand(i);
6325 if (Src.isUndef()) {
6326 UndefSrcElts.setBit(i);
6327 continue;
6328 }
6329 auto *Cst = cast<ConstantFPSDNode>(Src);
6330 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
6331 SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits);
6332 }
6333 return CastBitData(UndefSrcElts, SrcEltBits);
6334 }
6335
6336 // Extract constant bits from constant pool vector.
6337 if (auto *Cst = getTargetConstantFromNode(Op)) {
6338 Type *CstTy = Cst->getType();
6339 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
6340 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
6341 return false;
6342
6343 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
6344 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6345
6346 APInt UndefSrcElts(NumSrcElts, 0);
6347 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
6348 for (unsigned i = 0; i != NumSrcElts; ++i)
6349 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
6350 UndefSrcElts, i))
6351 return false;
6352
6353 return CastBitData(UndefSrcElts, SrcEltBits);
6354 }
6355
6356 // Extract constant bits from a broadcasted constant pool scalar.
6357 if (Op.getOpcode() == X86ISD::VBROADCAST &&
6358 EltSizeInBits <= VT.getScalarSizeInBits()) {
6359 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
6360 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
6361 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6362
6363 APInt UndefSrcElts(NumSrcElts, 0);
6364 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6365 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
6366 if (UndefSrcElts[0])
6367 UndefSrcElts.setBits(0, NumSrcElts);
6368 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6369 return CastBitData(UndefSrcElts, SrcEltBits);
6370 }
6371 }
6372 }
6373
6374 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
6375 EltSizeInBits <= VT.getScalarSizeInBits()) {
6376 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
6377 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
6378 return false;
6379
6380 SDValue Ptr = MemIntr->getBasePtr();
6381 if (Ptr->getOpcode() == X86ISD::Wrapper ||
6382 Ptr->getOpcode() == X86ISD::WrapperRIP)
6383 Ptr = Ptr->getOperand(0);
6384
6385 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
6386 if (!CNode || CNode->isMachineConstantPoolEntry() ||
6387 CNode->getOffset() != 0)
6388 return false;
6389
6390 if (const Constant *C = CNode->getConstVal()) {
6391 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
6392 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6393
6394 APInt UndefSrcElts(NumSrcElts, 0);
6395 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
6396 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
6397 if (UndefSrcElts[0])
6398 UndefSrcElts.setBits(0, NumSrcElts);
6399 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
6400 return CastBitData(UndefSrcElts, SrcEltBits);
6401 }
6402 }
6403 }
6404
6405 // Extract constant bits from a subvector broadcast.
6406 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
6407 SmallVector<APInt, 16> SubEltBits;
6408 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6409 UndefElts, SubEltBits, AllowWholeUndefs,
6410 AllowPartialUndefs)) {
6411 UndefElts = APInt::getSplat(NumElts, UndefElts);
6412 while (EltBits.size() < NumElts)
6413 EltBits.append(SubEltBits.begin(), SubEltBits.end());
6414 return true;
6415 }
6416 }
6417
6418 // Extract a rematerialized scalar constant insertion.
6419 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
6420 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
6421 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
6422 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
6423 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
6424
6425 APInt UndefSrcElts(NumSrcElts, 0);
6426 SmallVector<APInt, 64> SrcEltBits;
6427 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
6428 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
6429 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
6430 return CastBitData(UndefSrcElts, SrcEltBits);
6431 }
6432
6433 // Insert constant bits from base and subvector sources.
6434 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
6435 isa<ConstantSDNode>(Op.getOperand(2))) {
6436 // TODO - support insert_subvector through bitcasts.
6437 if (EltSizeInBits != VT.getScalarSizeInBits())
6438 return false;
6439
6440 APInt UndefSubElts;
6441 SmallVector<APInt, 32> EltSubBits;
6442 if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6443 UndefSubElts, EltSubBits,
6444 AllowWholeUndefs, AllowPartialUndefs) &&
6445 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6446 UndefElts, EltBits, AllowWholeUndefs,
6447 AllowPartialUndefs)) {
6448 unsigned BaseIdx = Op.getConstantOperandVal(2);
6449 UndefElts.insertBits(UndefSubElts, BaseIdx);
6450 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
6451 EltBits[BaseIdx + i] = EltSubBits[i];
6452 return true;
6453 }
6454 }
6455
6456 // Extract constant bits from a subvector's source.
6457 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6458 isa<ConstantSDNode>(Op.getOperand(1))) {
6459 // TODO - support extract_subvector through bitcasts.
6460 if (EltSizeInBits != VT.getScalarSizeInBits())
6461 return false;
6462
6463 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6464 UndefElts, EltBits, AllowWholeUndefs,
6465 AllowPartialUndefs)) {
6466 EVT SrcVT = Op.getOperand(0).getValueType();
6467 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6468 unsigned NumSubElts = VT.getVectorNumElements();
6469 unsigned BaseIdx = Op.getConstantOperandVal(1);
6470 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
6471 if ((BaseIdx + NumSubElts) != NumSrcElts)
6472 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
6473 if (BaseIdx != 0)
6474 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
6475 return true;
6476 }
6477 }
6478
6479 // Extract constant bits from shuffle node sources.
6480 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
6481 // TODO - support shuffle through bitcasts.
6482 if (EltSizeInBits != VT.getScalarSizeInBits())
6483 return false;
6484
6485 ArrayRef<int> Mask = SVN->getMask();
6486 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
6487 llvm::any_of(Mask, [](int M) { return M < 0; }))
6488 return false;
6489
6490 APInt UndefElts0, UndefElts1;
6491 SmallVector<APInt, 32> EltBits0, EltBits1;
6492 if (isAnyInRange(Mask, 0, NumElts) &&
6493 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
6494 UndefElts0, EltBits0, AllowWholeUndefs,
6495 AllowPartialUndefs))
6496 return false;
6497 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
6498 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
6499 UndefElts1, EltBits1, AllowWholeUndefs,
6500 AllowPartialUndefs))
6501 return false;
6502
6503 UndefElts = APInt::getNullValue(NumElts);
6504 for (int i = 0; i != (int)NumElts; ++i) {
6505 int M = Mask[i];
6506 if (M < 0) {
6507 UndefElts.setBit(i);
6508 EltBits.push_back(APInt::getNullValue(EltSizeInBits));
6509 } else if (M < (int)NumElts) {
6510 if (UndefElts0[M])
6511 UndefElts.setBit(i);
6512 EltBits.push_back(EltBits0[M]);
6513 } else {
6514 if (UndefElts1[M - NumElts])
6515 UndefElts.setBit(i);
6516 EltBits.push_back(EltBits1[M - NumElts]);
6517 }
6518 }
6519 return true;
6520 }
6521
6522 return false;
6523}
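// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// The CastBitData repacking above, shown for the concrete case of a 32-bit
// constant made of two 16-bit source elements being re-read as four 8-bit
// elements, with plain integers standing in for APInt.
#include <cstdint>
#include <vector>
static std::vector<uint8_t> repack16To8(uint16_t Src0, uint16_t Src1) {
  // Concatenate the source elements into one little-endian bit string...
  uint32_t Bits = (uint32_t)Src0 | ((uint32_t)Src1 << 16);
  // ...then slice it back out at the requested 8-bit element size.
  std::vector<uint8_t> EltBits;
  for (unsigned Off = 0; Off != 32; Off += 8)
    EltBits.push_back((uint8_t)(Bits >> Off));
  return EltBits; // repack16To8(0x3412, 0x7856) == {0x12, 0x34, 0x56, 0x78}.
}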
6524
6525namespace llvm {
6526namespace X86 {
6527bool isConstantSplat(SDValue Op, APInt &SplatVal) {
6528 APInt UndefElts;
6529 SmallVector<APInt, 16> EltBits;
6530 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
6531 UndefElts, EltBits, true, false)) {
6532 int SplatIndex = -1;
6533 for (int i = 0, e = EltBits.size(); i != e; ++i) {
6534 if (UndefElts[i])
6535 continue;
6536 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
6537 SplatIndex = -1;
6538 break;
6539 }
6540 SplatIndex = i;
6541 }
6542 if (0 <= SplatIndex) {
6543 SplatVal = EltBits[SplatIndex];
6544 return true;
6545 }
6546 }
6547
6548 return false;
6549}
6550} // namespace X86
6551} // namespace llvm
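// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// The splat scan in X86::isConstantSplat above, modeled with plain integers:
// undef lanes are skipped, and every remaining lane must carry the same bits.
#include <cstdint>
#include <vector>
static bool isSplatModel(const std::vector<uint64_t> &EltBits,
                         const std::vector<bool> &UndefElts,
                         uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
    if (UndefElts[i])
      continue;                                  // ignore undef lanes
    if (SplatIndex >= 0 && EltBits[i] != EltBits[SplatIndex])
      return false;                              // two distinct defined values
    SplatIndex = i;
  }
  if (SplatIndex < 0)
    return false;                                // all lanes were undef
  SplatVal = EltBits[SplatIndex];
  return true;
}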
6552
6553static bool getTargetShuffleMaskIndices(SDValue MaskNode,
6554 unsigned MaskEltSizeInBits,
6555 SmallVectorImpl<uint64_t> &RawMask,
6556 APInt &UndefElts) {
6557 // Extract the raw target constant bits.
6558 SmallVector<APInt, 64> EltBits;
6559 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
6560 EltBits, /* AllowWholeUndefs */ true,
6561 /* AllowPartialUndefs */ false))
6562 return false;
6563
6564 // Insert the extracted elements into the mask.
6565 for (APInt Elt : EltBits)
6566 RawMask.push_back(Elt.getZExtValue());
6567
6568 return true;
6569}
6570
6571/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
6572/// Note: This ignores saturation, so inputs must be checked first.
6573static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
6574 bool Unary) {
6575 assert(Mask.empty() && "Expected an empty shuffle mask vector");
6576 unsigned NumElts = VT.getVectorNumElements();
6577 unsigned NumLanes = VT.getSizeInBits() / 128;
6578 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
6579 unsigned Offset = Unary ? 0 : NumElts;
6580
6581 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
6582 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6583 Mask.push_back(Elt + (Lane * NumEltsPerLane));
6584 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
6585 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
6586 }
6587}
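// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// The pack mask above for a single 128-bit lane, e.g. a v8i16 x v8i16 ->
// v16i8 PACKSSWB/PACKUSWB: the result takes the low half of each element of
// operand 0, then of operand 1 (or of operand 0 again when Unary is true).
#include <vector>
static std::vector<int> packMaskOneLane(unsigned NumElts, bool Unary) {
  // NumElts is the number of *result* elements in the lane (16 for v16i8).
  std::vector<int> Mask;
  unsigned Offset = Unary ? 0 : NumElts;
  for (unsigned Elt = 0; Elt != NumElts; Elt += 2)
    Mask.push_back(Elt);                 // even (low) halves of operand 0
  for (unsigned Elt = 0; Elt != NumElts; Elt += 2)
    Mask.push_back(Elt + Offset);        // even (low) halves of operand 1
  return Mask; // NumElts=16, binary -> {0,2,...,14, 16,18,...,30}.
}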
6588
6589// Split the demanded elts of a PACKSS/PACKUS node between its operands.
6590static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
6591 APInt &DemandedLHS, APInt &DemandedRHS) {
6592 int NumLanes = VT.getSizeInBits() / 128;
6593 int NumElts = DemandedElts.getBitWidth();
6594 int NumInnerElts = NumElts / 2;
6595 int NumEltsPerLane = NumElts / NumLanes;
6596 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
6597
6598 DemandedLHS = APInt::getNullValue(NumInnerElts);
6599 DemandedRHS = APInt::getNullValue(NumInnerElts);
6600
6601 // Map DemandedElts to the packed operands.
6602 for (int Lane = 0; Lane != NumLanes; ++Lane) {
6603 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
6604 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
6605 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
6606 if (DemandedElts[OuterIdx])
6607 DemandedLHS.setBit(InnerIdx);
6608 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
6609 DemandedRHS.setBit(InnerIdx);
6610 }
6611 }
6612}
6613
6614// Split the demanded elts of a HADD/HSUB node between its operands.
6615static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
6616 APInt &DemandedLHS, APInt &DemandedRHS) {
6617 int NumLanes = VT.getSizeInBits() / 128;
6618 int NumElts = DemandedElts.getBitWidth();
6619 int NumEltsPerLane = NumElts / NumLanes;
6620 int HalfEltsPerLane = NumEltsPerLane / 2;
6621
6622 DemandedLHS = APInt::getNullValue(NumElts);
6623 DemandedRHS = APInt::getNullValue(NumElts);
6624
6625 // Map DemandedElts to the horizontal operands.
6626 for (int Idx = 0; Idx != NumElts; ++Idx) {
6627 if (!DemandedElts[Idx])
6628 continue;
6629 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
6630 int LocalIdx = Idx % NumEltsPerLane;
6631 if (LocalIdx < HalfEltsPerLane) {
6632 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6633 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6634 } else {
6635 LocalIdx -= HalfEltsPerLane;
6636 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
6637 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
6638 }
6639 }
6640}
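// --- Illustrative sketch, not part of X86ISelLowering.cpp -----------------
// The HADD/HSUB mapping above for a v4i32-style 128-bit lane, where the
// result is {a0+a1, a2+a3, b0+b1, b2+b3}: given a demanded result element,
// return whether it reads the RHS and the first of the two source elements.
#include <utility>
static std::pair<bool, int> horizSourceOfResultElt(int Idx) {
  const int NumEltsPerLane = 4, HalfEltsPerLane = NumEltsPerLane / 2;
  int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
  int LocalIdx = Idx % NumEltsPerLane;
  if (LocalIdx < HalfEltsPerLane)
    return {false, LaneIdx + 2 * LocalIdx};  // LHS elements 2i and 2i+1
  LocalIdx -= HalfEltsPerLane;
  return {true, LaneIdx + 2 * LocalIdx};     // RHS elements 2i and 2i+1
}
// horizSourceOfResultElt(1) == {false, 2}: result elt 1 reads LHS elts 2 and 3.
// horizSourceOfResultElt(2) == {true, 0}:  result elt 2 reads RHS elts 0 and 1.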
6641
6642/// Calculates the shuffle mask corresponding to the target-specific opcode.
6643/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
6644/// operands in \p Ops, and returns true.
6645/// Sets \p IsUnary to true if only one source is used. Note that this will set
6646/// IsUnary for shuffles which use a single input multiple times, and in those
6647/// cases it will adjust the mask to only have indices within that single input.
6648/// It is an error to call this with non-empty Mask/Ops vectors.
6649static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
6650 SmallVectorImpl<SDValue> &Ops,
6651 SmallVectorImpl<int> &Mask, bool &IsUnary) {
6652 unsigned NumElems = VT.getVectorNumElements();
6653 unsigned MaskEltSize = VT.getScalarSizeInBits();
6654 SmallVector<uint64_t, 32> RawMask;
6655 APInt RawUndefs;
6656 uint64_t ImmN;
6657
6658 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
6659 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
6660
6661 IsUnary = false;
6662 bool IsFakeUnary = false;
6663 switch (N->getOpcode()) {
6664 case X86ISD::BLENDI:
6665 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6666 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6667 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6668 DecodeBLENDMask(NumElems, ImmN, Mask);
6669 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6670 break;
6671 case X86ISD::SHUFP:
6672 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6673 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6674 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6675 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
6676 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6677 break;
6678 case X86ISD::INSERTPS:
6679 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6680 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6681 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6682 DecodeINSERTPSMask(ImmN, Mask);
6683 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6684 break;
6685 case X86ISD::EXTRQI:
6686 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6687 if (isa<ConstantSDNode>(N->getOperand(1)) &&
6688 isa<ConstantSDNode>(N->getOperand(2))) {
6689 int BitLen = N->getConstantOperandVal(1);
6690 int BitIdx = N->getConstantOperandVal(2);
6691 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6692 IsUnary = true;
6693 }
6694 break;
6695 case X86ISD::INSERTQI:
6696 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6697 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6698 if (isa<ConstantSDNode>(N->getOperand(2)) &&
6699 isa<ConstantSDNode>(N->getOperand(3))) {
6700 int BitLen = N->getConstantOperandVal(2);
6701 int BitIdx = N->getConstantOperandVal(3);
6702 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
6703 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6704 }
6705 break;
6706 case X86ISD::UNPCKH:
6707 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6708 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6709 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
6710 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6711 break;
6712 case X86ISD::UNPCKL:
6713 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6714 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6715 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
6716 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6717 break;
6718 case X86ISD::MOVHLPS:
6719 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6720 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6721 DecodeMOVHLPSMask(NumElems, Mask);
6722 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6723 break;
6724 case X86ISD::MOVLHPS:
6725 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6726 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6727 DecodeMOVLHPSMask(NumElems, Mask);
6728 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6729 break;
6730 case X86ISD::PALIGNR:
6731 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6732 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6733 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6734 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6735 DecodePALIGNRMask(NumElems, ImmN, Mask);
6736 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6737 Ops.push_back(N->getOperand(1));
6738 Ops.push_back(N->getOperand(0));
6739 break;
6740 case X86ISD::VSHLDQ:
6741 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6742 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6743 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6744 DecodePSLLDQMask(NumElems, ImmN, Mask);
6745 IsUnary = true;
6746 break;
6747 case X86ISD::VSRLDQ:
6748 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6749 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6750 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6751 DecodePSRLDQMask(NumElems, ImmN, Mask);
6752 IsUnary = true;
6753 break;
6754 case X86ISD::PSHUFD:
6755 case X86ISD::VPERMILPI:
6756 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6757 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6758 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
6759 IsUnary = true;
6760 break;
6761 case X86ISD::PSHUFHW:
6762 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6763 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6764 DecodePSHUFHWMask(NumElems, ImmN, Mask);
6765 IsUnary = true;
6766 break;
6767 case X86ISD::PSHUFLW:
6768 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6769 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6770 DecodePSHUFLWMask(NumElems, ImmN, Mask);
6771 IsUnary = true;
6772 break;
6773 case X86ISD::VZEXT_MOVL:
6774 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6775 DecodeZeroMoveLowMask(NumElems, Mask);
6776 IsUnary = true;
6777 break;
6778 case X86ISD::VBROADCAST: {
6779 SDValue N0 = N->getOperand(0);
6780 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
6781 // add the pre-extracted value to the Ops vector.
6782 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6783 N0.getOperand(0).getValueType() == VT &&
6784 N0.getConstantOperandVal(1) == 0)
6785 Ops.push_back(N0.getOperand(0));
6786
6787 // We only decode broadcasts of same-sized vectors, unless the broadcast
6788 // came from an extract from the original width. If we found one, we
6789 // pushed it to the Ops vector above.
6790 if (N0.getValueType() == VT || !Ops.empty()) {
6791 DecodeVectorBroadcast(NumElems, Mask);
6792 IsUnary = true;
6793 break;
6794 }
6795 return false;
6796 }
6797 case X86ISD::VPERMILPV: {
6798 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6799 IsUnary = true;
6800 SDValue MaskNode = N->getOperand(1);
6801 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6802 RawUndefs)) {
6803 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
6804 break;
6805 }
6806 return false;
6807 }
6808 case X86ISD::PSHUFB: {
6809 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
6810 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6811 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6812 IsUnary = true;
6813 SDValue MaskNode = N->getOperand(1);
6814 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6815 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
6816 break;
6817 }
6818 return false;
6819 }
6820 case X86ISD::VPERMI:
6821 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6822 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6823 DecodeVPERMMask(NumElems, ImmN, Mask);
6824 IsUnary = true;
6825 break;
6826 case X86ISD::MOVSS:
6827 case X86ISD::MOVSD:
6828 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6829 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6830 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
6831 break;
6832 case X86ISD::VPERM2X128:
6833 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6834 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6835 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6836 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
6837 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6838 break;
6839 case X86ISD::SHUF128:
6840 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6841 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6842 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
6843 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
6844 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6845 break;
6846 case X86ISD::MOVSLDUP:
6847 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6848 DecodeMOVSLDUPMask(NumElems, Mask);
6849 IsUnary = true;
6850 break;
6851 case X86ISD::MOVSHDUP:
6852 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6853 DecodeMOVSHDUPMask(NumElems, Mask);
6854 IsUnary = true;
6855 break;
6856 case X86ISD::MOVDDUP:
6857 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6858 DecodeMOVDDUPMask(NumElems, Mask);
6859 IsUnary = true;
6860 break;
6861 case X86ISD::VPERMIL2: {
6862 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6863 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6864 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6865 SDValue MaskNode = N->getOperand(2);
6866 SDValue CtrlNode = N->getOperand(3);
6867 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
6868 unsigned CtrlImm = CtrlOp->getZExtValue();
6869 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6870 RawUndefs)) {
6871 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
6872 Mask);
6873 break;
6874 }
6875 }
6876 return false;
6877 }
6878 case X86ISD::VPPERM: {
6879 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6880 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6881 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
6882 SDValue MaskNode = N->getOperand(2);
6883 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
6884 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
6885 break;
6886 }
6887 return false;
6888 }
6889 case X86ISD::VPERMV: {
6890 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6891 IsUnary = true;
6892 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6893 Ops.push_back(N->getOperand(1));
6894 SDValue MaskNode = N->getOperand(0);
6895 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6896 RawUndefs)) {
6897 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
6898 break;
6899 }
6900 return false;
6901 }
6902 case X86ISD::VPERMV3: {
6903 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6904 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6905 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6906 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6907 Ops.push_back(N->getOperand(0));
6908 Ops.push_back(N->getOperand(2));
6909 SDValue MaskNode = N->getOperand(1);
6910 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
6911 RawUndefs)) {
6912 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
6913 break;
6914 }
6915 return false;
6916 }
6917 default: llvm_unreachable("unknown target shuffle node");
6918 }
6919
6920 // Empty mask indicates the decode failed.
6921 if (Mask.empty())
6922 return false;
6923
6924 // Check if we're getting a shuffle mask with zero'd elements.
6925 if (!AllowSentinelZero)
6926 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6927 return false;
6928
6929 // If we have a fake unary shuffle, the shuffle mask is spread across two
6930 // inputs that are actually the same node. Re-map the mask to always point
6931 // into the first input.
6932 if (IsFakeUnary)
6933 for (int &M : Mask)
6934 if (M >= (int)Mask.size())
6935 M -= Mask.size();
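      // For example (illustrative note, not in the original source): with a
      // 4-element mask and identical operands, an index of 6 (element 2 of the
      // "second" input) is remapped to 2 so the mask only ever references the
      // first input.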
6936
6937 // If we didn't already add operands in the opcode-specific code, default to
6938 // adding 1 or 2 operands starting at 0.
6939 if (Ops.empty()) {
6940 Ops.push_back(N->getOperand(0));
6941 if (!IsUnary || IsFakeUnary)
6942 Ops.push_back(N->getOperand(1));
6943 }
6944
6945 return true;
6946}
6947
6948/// Compute whether each element of a shuffle is zeroable.
6949///
6950/// A "zeroable" vector shuffle element is one which can be lowered to zero.
6951/// Either it is an undef element in the shuffle mask, the element of the input
6952/// referenced is undef, or the element of the input referenced is known to be
6953/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
6954/// as many lanes with this technique as possible to simplify the remaining
6955/// shuffle.
6956static void computeZeroableShuffleElements(ArrayRef<int> Mask,
6957 SDValue V1, SDValue V2,
6958 APInt &KnownUndef, APInt &KnownZero) {
6959 int Size = Mask.size();
6960 KnownUndef = KnownZero = APInt::getNullValue(Size);
6961
6962 V1 = peekThroughBitcasts(V1);
6963 V2 = peekThroughBitcasts(V2);
6964
6965 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
6966 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
6967
6968 int VectorSizeInBits = V1.getValueSizeInBits();
6969 int ScalarSizeInBits = VectorSizeInBits / Size;
6970 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
6971
6972 for (int i = 0; i < Size; ++i) {
6973 int M = Mask[i];
6974 // Handle the easy cases.
6975 if (M < 0) {
6976 KnownUndef.setBit(i);
6977 continue;
6978 }
6979 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
6980 KnownZero.setBit(i);
6981 continue;
6982 }
6983
6984 // Determine shuffle input and normalize the mask.
6985 SDValue V = M < Size ? V1 : V2;
6986 M %= Size;
6987
6988 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
6989 if (V.getOpcode() != ISD::BUILD_VECTOR)
6990 continue;
6991
6992 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
6993 // the (larger) source element must be UNDEF/ZERO.
6994 if ((Size % V.getNumOperands()) == 0) {
6995 int Scale = Size / V->getNumOperands();
6996 SDValue Op = V.getOperand(M / Scale);
6997 if (Op.isUndef())
6998 KnownUndef.setBit(i);
6999 if (X86::isZeroNode(Op))
7000 KnownZero.setBit(i);
7001 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7002 APInt Val = Cst->getAPIntValue();
7003 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7004 if (Val == 0)
7005 KnownZero.setBit(i);
7006 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7007 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7008 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
7009 if (Val == 0)
7010 KnownZero.setBit(i);
7011 }
7012 continue;
7013 }
7014
7015 // If the BUILD_VECTOR has more elements, then all the (smaller) source
7016 // elements must be UNDEF or ZERO.
7017 if ((V.getNumOperands() % Size) == 0) {
7018 int Scale = V->getNumOperands() / Size;
7019 bool AllUndef = true;
7020 bool AllZero = true;
7021 for (int j = 0; j < Scale; ++j) {
7022 SDValue Op = V.getOperand((M * Scale) + j);
7023 AllUndef &= Op.isUndef();
7024 AllZero &= X86::isZeroNode(Op);
7025 }
7026 if (AllUndef)
7027 KnownUndef.setBit(i);
7028 if (AllZero)
7029 KnownZero.setBit(i);
7030 continue;
7031 }
7032 }
7033}
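// Worked example (editor's illustration, not from the original file): for a
// 4-element shuffle with Mask = <0, 2, -1, 7> where V2 is an all-zeros
// build_vector, element 2 is marked KnownUndef (negative mask index) and
// element 3 is marked KnownZero (it reads the all-zero V2); elements 0 and 1
// are only marked if V1 is a BUILD_VECTOR whose referenced operands can be
// shown to be undef or zero.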
7034
7035/// Decode a target shuffle mask and inputs and see if any values are
7036/// known to be undef or zero from their inputs.
7037/// Returns true if the target shuffle mask was decoded.
7038/// FIXME: Merge this with computeZeroableShuffleElements?
7039static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
7040 SmallVectorImpl<SDValue> &Ops,
7041 APInt &KnownUndef, APInt &KnownZero) {
7042 bool IsUnary;
7043 if (!isTargetShuffle(N.getOpcode()))
7044 return false;
7045
7046 MVT VT = N.getSimpleValueType();
7047 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
7048 return false;
7049
7050 int Size = Mask.size();
7051 SDValue V1 = Ops[0];
7052 SDValue V2 = IsUnary ? V1 : Ops[1];
7053 KnownUndef = KnownZero = APInt::getNullValue(Size);
7054
7055 V1 = peekThroughBitcasts(V1);
7056 V2 = peekThroughBitcasts(V2);
7057
7058 assert((VT.getSizeInBits() % Size) == 0 &&
7059        "Illegal split of shuffle value type");
7060 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
7061
7062 // Extract known constant input data.
7063 APInt UndefSrcElts[2];
7064 SmallVector<APInt, 32> SrcEltBits[2];
7065 bool IsSrcConstant[2] = {
7066 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
7067 SrcEltBits[0], true, false),
7068 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
7069 SrcEltBits[1], true, false)};
7070
7071 for (int i = 0; i < Size; ++i) {
7072 int M = Mask[i];
7073
7074 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
7075 if (M < 0) {
7076 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
7077 if (SM_SentinelUndef == M)
7078 KnownUndef.setBit(i);
7079 if (SM_SentinelZero == M)
7080 KnownZero.setBit(i);
7081 continue;
7082 }
7083
7084 // Determine shuffle input and normalize the mask.
7085 unsigned SrcIdx = M / Size;
7086 SDValue V = M < Size ? V1 : V2;
7087 M %= Size;
7088
7089 // We are referencing an UNDEF input.
7090 if (V.isUndef()) {
7091 KnownUndef.setBit(i);
7092 continue;
7093 }
7094
7095 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
7096 // TODO: We currently only set UNDEF for integer types - floats use the same
7097 // registers as vectors and many of the scalar folded loads rely on the
7098 // SCALAR_TO_VECTOR pattern.
7099 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
7100 (Size % V.getValueType().getVectorNumElements()) == 0) {
7101 int Scale = Size / V.getValueType().getVectorNumElements();
7102 int Idx = M / Scale;
7103 if (Idx != 0 && !VT.isFloatingPoint())
7104 KnownUndef.setBit(i);
7105 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
7106 KnownZero.setBit(i);
7107 continue;
7108 }
7109
7110 // Attempt to extract from the source's constant bits.
7111 if (IsSrcConstant[SrcIdx]) {
7112 if (UndefSrcElts[SrcIdx][M])
7113 KnownUndef.setBit(i);
7114 else if (SrcEltBits[SrcIdx][M] == 0)
7115 KnownZero.setBit(i);
7116 }
7117 }
7118
7119 assert(VT.getVectorNumElements() == (unsigned)Size &&
7120        "Different mask size from vector size!");
7121 return true;
7122}
7123
7124// Replace target shuffle mask elements with known undef/zero sentinels.
7125static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
7126 const APInt &KnownUndef,
7127 const APInt &KnownZero,
7128 bool ResolveKnownZeros= true) {
7129 unsigned NumElts = Mask.size();
7130 assert(KnownUndef.getBitWidth() == NumElts &&
7131        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
  [46] Assuming the condition is true
  [47] Assuming the condition is true
  [48] '?' condition is true
7132
7133 for (unsigned i = 0; i != NumElts; ++i) {
  [49] Assuming 'i' is equal to 'NumElts'
  [50] Loop condition is false. Execution continues on line 7133
7134 if (KnownUndef[i])
7135 Mask[i] = SM_SentinelUndef;
7136 else if (ResolveKnownZeros && KnownZero[i])
7137 Mask[i] = SM_SentinelZero;
7138 }
7139}
  [51] Returning without writing to 'Mask.Size'
7140
7141// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
7142static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
7143 APInt &KnownUndef,
7144 APInt &KnownZero) {
7145 unsigned NumElts = Mask.size();
7146 KnownUndef = KnownZero = APInt::getNullValue(NumElts);
7147
7148 for (unsigned i = 0; i != NumElts; ++i) {
7149 int M = Mask[i];
7150 if (SM_SentinelUndef == M)
7151 KnownUndef.setBit(i);
7152 if (SM_SentinelZero == M)
7153 KnownZero.setBit(i);
7154 }
7155}
7156
7157// Forward declaration (for getFauxShuffleMask recursive check).
7158// TODO: Use DemandedElts variant.
7159static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7160 SmallVectorImpl<int> &Mask,
7161 SelectionDAG &DAG, unsigned Depth,
7162 bool ResolveKnownElts);
7163
7164// Attempt to decode ops that could be represented as a shuffle mask.
7165 // The decoded shuffle mask may contain a different number of elements than
7166 // the destination value type.
7167static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
7168 SmallVectorImpl<int> &Mask,
7169 SmallVectorImpl<SDValue> &Ops,
7170 SelectionDAG &DAG, unsigned Depth,
7171 bool ResolveKnownElts) {
7172 Mask.clear();
7173 Ops.clear();
7174
7175 MVT VT = N.getSimpleValueType();
7176 unsigned NumElts = VT.getVectorNumElements();
7177 unsigned NumSizeInBits = VT.getSizeInBits();
7178 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
7179 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
7180 return false;
7181 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
7182
7183 unsigned Opcode = N.getOpcode();
7184 switch (Opcode) {
7185 case ISD::VECTOR_SHUFFLE: {
7186 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
7187 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
7188 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
7189 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
7190 Ops.push_back(N.getOperand(0));
7191 Ops.push_back(N.getOperand(1));
7192 return true;
7193 }
7194 return false;
7195 }
7196 case ISD::AND:
7197 case X86ISD::ANDNP: {
7198 // Attempt to decode as a per-byte mask.
7199 APInt UndefElts;
7200 SmallVector<APInt, 32> EltBits;
7201 SDValue N0 = N.getOperand(0);
7202 SDValue N1 = N.getOperand(1);
7203 bool IsAndN = (X86ISD::ANDNP == Opcode);
7204 uint64_t ZeroMask = IsAndN ? 255 : 0;
7205 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
7206 return false;
7207 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
7208 if (UndefElts[i]) {
7209 Mask.push_back(SM_SentinelUndef);
7210 continue;
7211 }
7212 const APInt &ByteBits = EltBits[i];
7213 if (ByteBits != 0 && ByteBits != 255)
7214 return false;
7215 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
7216 }
7217 Ops.push_back(IsAndN ? N1 : N0);
7218 return true;
7219 }
7220 case ISD::OR: {
7221 // Inspect each operand at the byte level. We can merge these into a
7222 // blend shuffle mask if for each byte at least one is masked out (zero).
7223 KnownBits Known0 =
7224 DAG.computeKnownBits(N.getOperand(0), DemandedElts, Depth + 1);
7225 KnownBits Known1 =
7226 DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
7227 if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
7228 bool IsByteMask = true;
7229 unsigned NumSizeInBytes = NumSizeInBits / 8;
7230 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7231 APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
7232 APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
7233 for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
7234 unsigned LHS = Known0.Zero.extractBits(8, i * 8).getZExtValue();
7235 unsigned RHS = Known1.Zero.extractBits(8, i * 8).getZExtValue();
7236 if (LHS == 255 && RHS == 0)
7237 SelectMask.setBit(i);
7238 else if (LHS == 255 && RHS == 255)
7239 ZeroMask.setBit(i);
7240 else if (!(LHS == 0 && RHS == 255))
7241 IsByteMask = false;
7242 }
7243 if (IsByteMask) {
7244 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) {
7245 for (unsigned j = 0; j != NumBytesPerElt; ++j) {
7246 unsigned Ofs = (SelectMask[j] ? NumSizeInBytes : 0);
7247 int Idx = (ZeroMask[j] ? (int)SM_SentinelZero : (i + j + Ofs));
7248 Mask.push_back(Idx);
7249 }
7250 }
7251 Ops.push_back(N.getOperand(0));
7252 Ops.push_back(N.getOperand(1));
7253 return true;
7254 }
7255 }
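      // Illustrative note (not in the original source): per byte of each
      // element, the blend takes the byte from operand 1 when only operand 0's
      // byte is known zero (SelectMask bit set), emits SM_SentinelZero when
      // both bytes are known zero (ZeroMask bit set), and takes the byte from
      // operand 0 when only operand 1's byte is known zero; any other
      // combination defeats the byte-blend decoding.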
7256
7257 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
7258 // is a valid shuffle index.
7259 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
7260 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
7261 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
7262 return false;
7263 SmallVector<int, 64> SrcMask0, SrcMask1;
7264 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
7265 if (!getTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG, Depth + 1,
7266 true) ||
7267 !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
7268 true))
7269 return false;
7270
7271 // Shuffle inputs must be the same size as the result.
7272 if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
7273 return VT.getSizeInBits() != Op.getValueSizeInBits();
7274 }))
7275 return false;
7276 if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
7277 return VT.getSizeInBits() != Op.getValueSizeInBits();
7278 }))
7279 return false;
7280
7281 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
7282 SmallVector<int, 64> Mask0, Mask1;
7283 scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
7284 scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
7285 for (size_t i = 0; i != MaskSize; ++i) {
7286 if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
7287 Mask.push_back(SM_SentinelUndef);
7288 else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
7289 Mask.push_back(SM_SentinelZero);
7290 else if (Mask1[i] == SM_SentinelZero)
7291 Mask.push_back(Mask0[i]);
7292 else if (Mask0[i] == SM_SentinelZero)
7293 Mask.push_back(Mask1[i] + (int)(MaskSize * SrcInputs0.size()));
7294 else
7295 return false;
7296 }
7297 Ops.append(SrcInputs0.begin(), SrcInputs0.end());
7298 Ops.append(SrcInputs1.begin(), SrcInputs1.end());
7299 return true;
7300 }
7301 case ISD::INSERT_SUBVECTOR: {
7302 SDValue Src = N.getOperand(0);
7303 SDValue Sub = N.getOperand(1);
7304 EVT SubVT = Sub.getValueType();
7305 unsigned NumSubElts = SubVT.getVectorNumElements();
7306 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
7307 !N->isOnlyUserOf(Sub.getNode()))
7308 return false;
7309 uint64_t InsertIdx = N.getConstantOperandVal(2);
7310 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
7311 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7312 Sub.getOperand(0).getValueType() == VT &&
7313 isa<ConstantSDNode>(Sub.getOperand(1))) {
7314 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
7315 for (int i = 0; i != (int)NumElts; ++i)
7316 Mask.push_back(i);
7317 for (int i = 0; i != (int)NumSubElts; ++i)
7318 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
7319 Ops.push_back(Src);
7320 Ops.push_back(Sub.getOperand(0));
7321 return true;
7322 }
7323 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
7324 SmallVector<int, 64> SubMask;
7325 SmallVector<SDValue, 2> SubInputs;
7326 if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
7327 SubMask, DAG, Depth + 1, ResolveKnownElts))
7328 return false;
7329
7330 // Shuffle inputs must be the same size as the subvector.
7331 if (llvm::any_of(SubInputs, [SubVT](SDValue Op) {
7332 return SubVT.getSizeInBits() != Op.getValueSizeInBits();
7333 }))
7334 return false;
7335
7336 if (SubMask.size() != NumSubElts) {
7337 assert(((SubMask.size() % NumSubElts) == 0 ||
7338         (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
7339 if ((NumSubElts % SubMask.size()) == 0) {
7340 int Scale = NumSubElts / SubMask.size();
7341 SmallVector<int,64> ScaledSubMask;
7342 scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
7343 SubMask = ScaledSubMask;
7344 } else {
7345 int Scale = SubMask.size() / NumSubElts;
7346 NumSubElts = SubMask.size();
7347 NumElts *= Scale;
7348 InsertIdx *= Scale;
7349 }
7350 }
7351 Ops.push_back(Src);
7352 for (SDValue &SubInput : SubInputs) {
7353 EVT SubSVT = SubInput.getValueType().getScalarType();
7354 EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
7355 NumSizeInBits / SubSVT.getSizeInBits());
7356 Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
7357 DAG.getUNDEF(AltVT), SubInput,
7358 DAG.getIntPtrConstant(0, SDLoc(N))));
7359 }
7360 for (int i = 0; i != (int)NumElts; ++i)
7361 Mask.push_back(i);
7362 for (int i = 0; i != (int)NumSubElts; ++i) {
7363 int M = SubMask[i];
7364 if (0 <= M) {
7365 int InputIdx = M / NumSubElts;
7366 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
7367 }
7368 Mask[i + InsertIdx] = M;
7369 }
7370 return true;
7371 }
7372 case ISD::SCALAR_TO_VECTOR: {
7373 // Match against a scalar_to_vector of an extract from a vector,
7374 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
7375 SDValue N0 = N.getOperand(0);
7376 SDValue SrcExtract;
7377
7378 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7379 N0.getOperand(0).getValueType() == VT) ||
7380 (N0.getOpcode() == X86ISD::PEXTRW &&
7381 N0.getOperand(0).getValueType() == MVT::v8i16) ||
7382 (N0.getOpcode() == X86ISD::PEXTRB &&
7383 N0.getOperand(0).getValueType() == MVT::v16i8)) {
7384 SrcExtract = N0;
7385 }
7386
7387 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
7388 return false;
7389
7390 SDValue SrcVec = SrcExtract.getOperand(0);
7391 EVT SrcVT = SrcVec.getValueType();
7392 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7393 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
7394
7395 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
7396 if (NumSrcElts <= SrcIdx)
7397 return false;
7398
7399 Ops.push_back(SrcVec);
7400 Mask.push_back(SrcIdx);
7401 Mask.append(NumZeros, SM_SentinelZero);
7402 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
7403 return true;
7404 }
7405 case X86ISD::PINSRB:
7406 case X86ISD::PINSRW: {
7407 SDValue InVec = N.getOperand(0);
7408 SDValue InScl = N.getOperand(1);
7409 SDValue InIndex = N.getOperand(2);
7410 if (!isa<ConstantSDNode>(InIndex) ||
7411 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
7412 return false;
7413 uint64_t InIdx = N.getConstantOperandVal(2);
7414
7415 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
7416 if (X86::isZeroNode(InScl)) {
7417 Ops.push_back(InVec);
7418 for (unsigned i = 0; i != NumElts; ++i)
7419 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
7420 return true;
7421 }
7422
7423 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
7424 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
7425 unsigned ExOp =
7426 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
7427 if (InScl.getOpcode() != ExOp)
7428 return false;
7429
7430 SDValue ExVec = InScl.getOperand(0);
7431 SDValue ExIndex = InScl.getOperand(1);
7432 if (!isa<ConstantSDNode>(ExIndex) ||
7433 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
7434 return false;
7435 uint64_t ExIdx = InScl.getConstantOperandVal(1);
7436
7437 Ops.push_back(InVec);
7438 Ops.push_back(ExVec);
7439 for (unsigned i = 0; i != NumElts; ++i)
7440 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
7441 return true;
7442 }
7443 case X86ISD::PACKSS:
7444 case X86ISD::PACKUS: {
7445 SDValue N0 = N.getOperand(0);
7446 SDValue N1 = N.getOperand(1);
7447 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
7448        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
7449        "Unexpected input value type");
7450
7451 APInt EltsLHS, EltsRHS;
7452 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
7453
7454 // If we know input saturation won't happen we can treat this
7455 // as a truncation shuffle.
7456 if (Opcode == X86ISD::PACKSS) {
7457 if ((!N0.isUndef() &&
7458 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
7459 (!N1.isUndef() &&
7460 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
7461 return false;
7462 } else {
7463 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
7464 if ((!N0.isUndef() &&
7465 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
7466 (!N1.isUndef() &&
7467 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
7468 return false;
7469 }
7470
7471 bool IsUnary = (N0 == N1);
7472
7473 Ops.push_back(N0);
7474 if (!IsUnary)
7475 Ops.push_back(N1);
7476
7477 createPackShuffleMask(VT, Mask, IsUnary);
7478 return true;
7479 }
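      // Illustrative note (not from the original source): for a v16i8 PACKUS
      // of two v8i16 inputs, if the upper 8 bits of every 16-bit lane are
      // known to be zero then no saturation can occur, so the pack simply
      // keeps the low byte of each lane and is modelled here as a byte-level
      // truncation shuffle of the two inputs.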
7480 case X86ISD::VSHLI:
7481 case X86ISD::VSRLI: {
7482 uint64_t ShiftVal = N.getConstantOperandVal(1);
7483 // Out of range bit shifts are guaranteed to be zero.
7484 if (NumBitsPerElt <= ShiftVal) {
7485 Mask.append(NumElts, SM_SentinelZero);
7486 return true;
7487 }
7488
7489 // We can only decode 'whole byte' bit shifts as shuffles.
7490 if ((ShiftVal % 8) != 0)
7491 break;
7492
7493 uint64_t ByteShift = ShiftVal / 8;
7494 unsigned NumBytes = NumSizeInBits / 8;
7495 unsigned NumBytesPerElt = NumBitsPerElt / 8;
7496 Ops.push_back(N.getOperand(0));
7497
7498 // Clear mask to all zeros and insert the shifted byte indices.
7499 Mask.append(NumBytes, SM_SentinelZero);
7500
7501 if (X86ISD::VSHLI == Opcode) {
7502 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
7503 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7504 Mask[i + j] = i + j - ByteShift;
7505 } else {
7506 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
7507 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
7508 Mask[i + j - ByteShift] = i + j;
7509 }
7510 return true;
7511 }
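      // Illustrative example (not from the original source): a VSRLI of v2i64
      // by 16 bits is a two-byte shift within each 8-byte element, so bytes
      // <2..7> of an element move down to byte positions <0..5> and the top
      // two byte positions of each element decode as SM_SentinelZero.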
7512 case X86ISD::VROTLI:
7513 case X86ISD::VROTRI: {
7514 // We can only decode 'whole byte' bit rotates as shuffles.
7515 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
7516 if ((RotateVal % 8) != 0)
7517 return false;
7518 Ops.push_back(N.getOperand(0));
7519 int NumBytesPerElt = NumBitsPerElt / 8;
7520 int Offset = RotateVal / 8;
7521 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
7522 for (int i = 0; i != (int)NumElts; ++i) {
7523 int BaseIdx = i * NumBytesPerElt;
7524 for (int j = 0; j != NumBytesPerElt; ++j) {
7525 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
7526 }
7527 }
7528 return true;
7529 }
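      // Illustrative example (not from the original source): a VROTLI of v4i32
      // by 8 gives Offset = 4 - 1 = 3, so each 4-byte element decodes as the
      // byte mask <3,0,1,2> relative to its base index, i.e. a rotate of the
      // element left by one byte.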
7530 case X86ISD::VBROADCAST: {
7531 SDValue Src = N.getOperand(0);
7532 MVT SrcVT = Src.getSimpleValueType();
7533 if (!SrcVT.isVector())
7534 return false;
7535
7536 if (NumSizeInBits != SrcVT.getSizeInBits()) {
7537 assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
7538        "Illegal broadcast type");
7539 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
7540 NumSizeInBits / SrcVT.getScalarSizeInBits());
7541 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
7542 DAG.getUNDEF(SrcVT), Src,
7543 DAG.getIntPtrConstant(0, SDLoc(N)));
7544 }
7545
7546 Ops.push_back(Src);
7547 Mask.append(NumElts, 0);
7548 return true;
7549 }
7550 case ISD::ZERO_EXTEND:
7551 case ISD::ANY_EXTEND:
7552 case ISD::ZERO_EXTEND_VECTOR_INREG:
7553 case ISD::ANY_EXTEND_VECTOR_INREG: {
7554 SDValue Src = N.getOperand(0);
7555 EVT SrcVT = Src.getValueType();
7556
7557 // Extended source must be a simple vector.
7558 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
7559 (SrcVT.getScalarSizeInBits() % 8) != 0)
7560 return false;
7561
7562 unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
7563 bool IsAnyExtend =
7564 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
7565 DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
7566 Mask);
7567
7568 if (NumSizeInBits != SrcVT.getSizeInBits()) {
7569 assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
7570        "Illegal zero-extension type");
7571 SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
7572 NumSizeInBits / NumSrcBitsPerElt);
7573 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
7574 DAG.getUNDEF(SrcVT), Src,
7575 DAG.getIntPtrConstant(0, SDLoc(N)));
7576 }
7577
7578 Ops.push_back(Src);
7579 return true;
7580 }
7581 }
7582
7583 return false;
7584}
7585
7586/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
7587static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
7588 SmallVectorImpl<int> &Mask) {
7589 int MaskWidth = Mask.size();
7590 SmallVector<SDValue, 16> UsedInputs;
7591 for (int i = 0, e = Inputs.size(); i < e; ++i) {
7592 int lo = UsedInputs.size() * MaskWidth;
7593 int hi = lo + MaskWidth;
7594
7595 // Strip UNDEF input usage.
7596 if (Inputs[i].isUndef())
7597 for (int &M : Mask)
7598 if ((lo <= M) && (M < hi))
7599 M = SM_SentinelUndef;
7600
7601 // Check for unused inputs.
7602 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
7603 for (int &M : Mask)
7604 if (lo <= M)
7605 M -= MaskWidth;
7606 continue;
7607 }
7608
7609 // Check for repeated inputs.
7610 bool IsRepeat = false;
7611 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
7612 if (UsedInputs[j] != Inputs[i])
7613 continue;
7614 for (int &M : Mask)
7615 if (lo <= M)
7616 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
7617 IsRepeat = true;
7618 break;
7619 }
7620 if (IsRepeat)
7621 continue;
7622
7623 UsedInputs.push_back(Inputs[i]);
7624 }
7625 Inputs = UsedInputs;
7626}
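// Worked example (editor's illustration, not from the original file): given
// Inputs = {A, A} and the 4-wide mask <0,4,1,5>, the second input is detected
// as a repeat of the first, the mask is rewritten to <0,0,1,1>, and Inputs
// collapses to {A}.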
7627
7628/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
7629/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
7630/// Returns true if the target shuffle mask was decoded.
7631static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
7632 SmallVectorImpl<SDValue> &Inputs,
7633 SmallVectorImpl<int> &Mask,
7634 APInt &KnownUndef, APInt &KnownZero,
7635 SelectionDAG &DAG, unsigned Depth,
7636 bool ResolveKnownElts) {
7637 EVT VT = Op.getValueType();
7638 if (!VT.isSimple() || !VT.isVector())
  [15] Calling 'EVT::isSimple'
  [17] Returning from 'EVT::isSimple'
  [18] Calling 'EVT::isVector'
  [24] Returning from 'EVT::isVector'
  [25] Taking false branch
7639 return false;
7640
7641 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
  [26] Value assigned to 'OpMask.Size'
  [27] Assuming the condition is true
  [28] Taking true branch
7642 if (ResolveKnownElts)
  [28.1] 'ResolveKnownElts' is false
  [29] Taking false branch
7643 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
7644 return true;
  [30] Returning the value 1, which participates in a condition later
7645 }
7646 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
7647 ResolveKnownElts)) {
7648 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
7649 return true;
7650 }
7651 return false;
7652}
7653
7654static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
7655 SmallVectorImpl<int> &Mask,
7656 SelectionDAG &DAG, unsigned Depth = 0,
7657 bool ResolveKnownElts = true) {
7658 EVT VT = Op.getValueType();
7659 if (!VT.isSimple() || !VT.isVector())
7660 return false;
7661
7662 APInt KnownUndef, KnownZero;
7663 unsigned NumElts = Op.getValueType().getVectorNumElements();
7664 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
7665 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
7666 KnownZero, DAG, Depth, ResolveKnownElts);
7667}
7668
7669/// Returns the scalar element that will make up the i'th
7670/// element of the result of the vector shuffle.
7671static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
7672 unsigned Depth) {
7673 if (Depth == 6)
7674 return SDValue(); // Limit search depth.
7675
7676 SDValue V = SDValue(N, 0);
7677 EVT VT = V.getValueType();
7678 unsigned Opcode = V.getOpcode();
7679
7680 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
7681 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
7682 int Elt = SV->getMaskElt(Index);
7683
7684 if (Elt < 0)
7685 return DAG.getUNDEF(VT.getVectorElementType());
7686
7687 unsigned NumElems = VT.getVectorNumElements();
7688 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
7689 : SV->getOperand(1);
7690 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
7691 }
7692
7693 // Recurse into target specific vector shuffles to find scalars.
7694 if (isTargetShuffle(Opcode)) {
7695 MVT ShufVT = V.getSimpleValueType();
7696 MVT ShufSVT = ShufVT.getVectorElementType();
7697 int NumElems = (int)ShufVT.getVectorNumElements();
7698 SmallVector<int, 16> ShuffleMask;
7699 SmallVector<SDValue, 16> ShuffleOps;
7700 bool IsUnary;
7701
7702 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
7703 return SDValue();
7704
7705 int Elt = ShuffleMask[Index];
7706 if (Elt == SM_SentinelZero)
7707 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
7708 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
7709 if (Elt == SM_SentinelUndef)
7710 return DAG.getUNDEF(ShufSVT);
7711
7712 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
7713 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
7714 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
7715 Depth+1);
7716 }
7717
7718 // Recurse into insert_subvector base/sub vector to find scalars.
7719 if (Opcode == ISD::INSERT_SUBVECTOR &&
7720 isa<ConstantSDNode>(N->getOperand(2))) {
7721 SDValue Vec = N->getOperand(0);
7722 SDValue Sub = N->getOperand(1);
7723 EVT SubVT = Sub.getValueType();
7724 unsigned NumSubElts = SubVT.getVectorNumElements();
7725 uint64_t SubIdx = N->getConstantOperandVal(2);
7726
7727 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
7728 return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
7729 return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
7730 }
7731
7732 // Recurse into extract_subvector src vector to find scalars.
7733 if (Opcode == ISD::EXTRACT_SUBVECTOR &&
7734 isa<ConstantSDNode>(N->getOperand(1))) {
7735 SDValue Src = N->getOperand(0);
7736 uint64_t SrcIdx = N->getConstantOperandVal(1);
7737 return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
7738 }
7739
7740 // Actual nodes that may contain scalar elements
7741 if (Opcode == ISD::BITCAST) {
7742 V = V.getOperand(0);
7743 EVT SrcVT = V.getValueType();
7744 unsigned NumElems = VT.getVectorNumElements();
7745
7746 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
7747 return SDValue();
7748 }
7749
7750 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
7751 return (Index == 0) ? V.getOperand(0)
7752 : DAG.getUNDEF(VT.getVectorElementType());
7753
7754 if (V.getOpcode() == ISD::BUILD_VECTOR)
7755 return V.getOperand(Index);
7756
7757 return SDValue();
7758}
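// Worked example (editor's illustration, not from the original file): for a
// v4 ISD::VECTOR_SHUFFLE of (A, B) with mask <4,1,6,3>, requesting element 2
// follows mask entry 6 into the second operand and recurses to element 2 of B.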
7759
7760// Use PINSRB/PINSRW/PINSRD to create a build vector.
7761static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
7762 unsigned NumNonZero, unsigned NumZero,
7763 SelectionDAG &DAG,
7764 const X86Subtarget &Subtarget) {
7765 MVT VT = Op.getSimpleValueType();
7766 unsigned NumElts = VT.getVectorNumElements();
7767 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
7768         ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
7769        "Illegal vector insertion");
7770
7771 SDLoc dl(Op);
7772 SDValue V;
7773 bool First = true;
7774
7775 for (unsigned i = 0; i < NumElts; ++i) {
7776 bool IsNonZero = (NonZeros & (1 << i)) != 0;
7777 if (!IsNonZero)
7778 continue;
7779
7780 // If the build vector contains zeros or our first insertion is not the
7781 // first index then insert into zero vector to break any register
7782 // dependency else use SCALAR_TO_VECTOR.
7783 if (First) {
7784 First = false;
7785 if (NumZero || 0 != i)
7786 V = getZeroVector(VT, Subtarget, DAG, dl);
7787 else {
7788 assert(0 == i && "Expected insertion into zero-index");
7789 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7790 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
7791 V = DAG.getBitcast(VT, V);
7792 continue;
7793 }
7794 }
7795 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
7796 DAG.getIntPtrConstant(i, dl));
7797 }
7798
7799 return V;
7800}
7801
7802/// Custom lower build_vector of v16i8.
7803static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
7804 unsigned NumNonZero, unsigned NumZero,
7805 SelectionDAG &DAG,
7806 const X86Subtarget &Subtarget) {
7807 if (NumNonZero > 8 && !Subtarget.hasSSE41())
7808 return SDValue();
7809
7810 // SSE4.1 - use PINSRB to insert each byte directly.
7811 if (Subtarget.hasSSE41())
7812 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
7813 Subtarget);
7814
7815 SDLoc dl(Op);
7816 SDValue V;
7817
7818 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
7819 for (unsigned i = 0; i < 16; i += 2) {
7820 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
7821 bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
7822 if (!ThisIsNonZero && !NextIsNonZero)
7823 continue;
7824
7825 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
7826 SDValue Elt;
7827 if (ThisIsNonZero) {
7828 if (NumZero || NextIsNonZero)
7829 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7830 else
7831 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
7832 }
7833
7834 if (NextIsNonZero) {
7835 SDValue NextElt = Op.getOperand(i + 1);
7836 if (i == 0 && NumZero)
7837 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
7838 else
7839 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
7840 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
7841 DAG.getConstant(8, dl, MVT::i8));
7842 if (ThisIsNonZero)
7843 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
7844 else
7845 Elt = NextElt;
7846 }
7847
7848 // If our first insertion is not the first index then insert into zero
7849 // vector to break any register dependency else use SCALAR_TO_VECTOR.
7850 if (!V) {
7851 if (i != 0)
7852 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
7853 else {
7854 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
7855 V = DAG.getBitcast(MVT::v8i16, V);
7856 continue;
7857 }
7858 }
7859 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
7860 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
7861 DAG.getIntPtrConstant(i / 2, dl));
7862 }
7863
7864 return DAG.getBitcast(MVT::v16i8, V);
7865}
7866
7867/// Custom lower build_vector of v8i16.
7868static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
7869 unsigned NumNonZero, unsigned NumZero,
7870 SelectionDAG &DAG,
7871 const X86Subtarget &Subtarget) {
7872 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7873 return SDValue();
7874
7875 // Use PINSRW to insert each byte directly.
7876 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
7877 Subtarget);
7878}
7879
7880/// Custom lower build_vector of v4i32 or v4f32.
7881static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
7882 const X86Subtarget &Subtarget) {
7883 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7884 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7885 // Because we're creating a less complicated build vector here, we may enable
7886 // further folding of the MOVDDUP via shuffle transforms.
7887 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7888 Op.getOperand(0) == Op.getOperand(2) &&
7889 Op.getOperand(1) == Op.getOperand(3) &&
7890 Op.getOperand(0) != Op.getOperand(1)) {
7891 SDLoc DL(Op);
7892 MVT VT = Op.getSimpleValueType();
7893 MVT EltVT = VT.getVectorElementType();
7894 // Create a new build vector with the first 2 elements followed by undef
7895 // padding, bitcast to v2f64, duplicate, and bitcast back.
7896 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7897 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7898 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7899 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7900 return DAG.getBitcast(VT, Dup);
7901 }
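  // Illustrative example (not from the original source): on SSE3 without XOP,
  // the v4f32 build_vector <a,b,a,b> (with a != b) is rebuilt as
  // <a,b,undef,undef>, bitcast to v2f64, duplicated with MOVDDUP and bitcast
  // back, which reproduces the <a,b,a,b> splat-of-a-pair.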
7902
7903 // Find all zeroable elements.
7904 std::bitset<4> Zeroable, Undefs;
7905 for (int i = 0; i < 4; ++i) {
7906 SDValue Elt = Op.getOperand(i);
7907 Undefs[i] = Elt.isUndef();
7908 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7909 }
7910 assert(Zeroable.size() - Zeroable.count() > 1 &&
7911        "We expect at least two non-zero elements!");
7912
7913 // We only know how to deal with build_vector nodes where elements are either
7914 // zeroable or extract_vector_elt with constant index.
7915 SDValue FirstNonZero;
7916 unsigned FirstNonZeroIdx;
7917 for (unsigned i = 0; i < 4; ++i) {
7918 if (Zeroable[i])
7919 continue;
7920 SDValue Elt = Op.getOperand(i);
7921 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7922 !isa<ConstantSDNode>(Elt.getOperand(1)))
7923 return SDValue();
7924 // Make sure that this node is extracting from a 128-bit vector.
7925 MVT VT = Elt.getOperand(0).getSimpleValueType();
7926 if (!VT.is128BitVector())
7927 return SDValue();
7928 if (!FirstNonZero.getNode()) {
7929 FirstNonZero = Elt;
7930 FirstNonZeroIdx = i;
7931 }
7932 }
7933
7934 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7935 SDValue V1 = FirstNonZero.getOperand(0);
7936 MVT VT = V1.getSimpleValueType();
7937
7938 // See if this build_vector can be lowered as a blend with zero.
7939 SDValue Elt;
7940 unsigned EltMaskIdx, EltIdx;
7941 int Mask[4];
7942 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7943 if (Zeroable[EltIdx]) {
7944 // The zero vector will be on the right hand side.
7945 Mask[EltIdx] = EltIdx+4;
7946 continue;
7947 }
7948
7949 Elt = Op->getOperand(EltIdx);
7950 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
7951 EltMaskIdx = Elt.getConstantOperandVal(1);
7952 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7953 break;
7954 Mask[EltIdx] = EltIdx;
7955 }
7956
7957 if (EltIdx == 4) {
7958 // Let the shuffle legalizer deal with blend operations.
7959 SDValue VZeroOrUndef = (Zeroable == Undefs)
7960 ? DAG.getUNDEF(VT)
7961 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
7962 if (V1.getSimpleValueType() != VT)
7963 V1 = DAG.getBitcast(VT, V1);
7964 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7965 }
7966
7967 // See if we can lower this build_vector to an INSERTPS.
7968 if (!Subtarget.hasSSE41())
7969 return SDValue();
7970
7971 SDValue V2 = Elt.getOperand(0);
7972 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7973 V1 = SDValue();
7974
7975 bool CanFold = true;
7976 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7977 if (Zeroable[i])
7978 continue;
7979
7980 SDValue Current = Op->getOperand(i);
7981 SDValue SrcVector = Current->getOperand(0);
7982 if (!V1.getNode())
7983 V1 = SrcVector;
7984 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7985 }
7986
7987 if (!CanFold)
7988 return SDValue();
7989
7990 assert(V1.getNode() && "Expected at least two non-zero elements!");
7991 if (V1.getSimpleValueType() != MVT::v4f32)
7992 V1 = DAG.getBitcast(MVT::v4f32, V1);
7993 if (V2.getSimpleValueType() != MVT::v4f32)
7994 V2 = DAG.getBitcast(MVT::v4f32, V2);
7995
7996 // Ok, we can emit an INSERTPS instruction.
7997 unsigned ZMask = Zeroable.to_ulong();
7998
7999 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
8000 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
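// Illustrative worked example (not from the original source): the INSERTPS
// immediate packs bits [7:6] = source lane of V2 (EltMaskIdx), bits [5:4] =
// destination lane of V1 (EltIdx) and bits [3:0] = zero mask. For instance,
// EltMaskIdx = 2, EltIdx = 1 and Zeroable = {3} give
//   (2 << 6) | (1 << 4) | 0b1000 = 0x98.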
8001 SDLoc DL(Op);
8002 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8003 DAG.getIntPtrConstant(InsertPSMask, DL, true));
8004 return DAG.getBitcast(VT, Result);
8005}
8006
8007/// Return a vector logical shift node.
8008static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
8009 SelectionDAG &DAG, const TargetLowering &TLI,
8010 const SDLoc &dl) {
8011 assert(VT.is128BitVector() && "Unknown type for VShift");
8012 MVT ShVT = MVT::v16i8;
8013 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
8014 SrcOp = DAG.getBitcast(ShVT, SrcOp);
8015 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
8016 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
8017 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
8018}
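// Illustrative example (not from the original source): getVShift(true,
// MVT::v2i64, Src, 64, ...) bitcasts Src to v16i8, emits X86ISD::VSHLDQ with an
// 8-byte amount (i.e. PSLLDQ $8) and bitcasts the result back to v2i64.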
8019
8020static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
8021 SelectionDAG &DAG) {
8022
8023 // Check whether the scalar load can be widened into a vector load, and if
8024 // the address is "base + cst", whether the cst can be "absorbed" into
8025 // the shuffle mask.
8026 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
8027 SDValue Ptr = LD->getBasePtr();
8028 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
8029 return SDValue();
8030 EVT PVT = LD->getValueType(0);
8031 if (PVT != MVT::i32 && PVT != MVT::f32)
8032 return SDValue();
8033
8034 int FI = -1;
8035 int64_t Offset = 0;
8036 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
8037 FI = FINode->getIndex();
8038 Offset = 0;
8039 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
8040 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
8041 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
8042 Offset = Ptr.getConstantOperandVal(1);
8043 Ptr = Ptr.getOperand(0);
8044 } else {
8045 return SDValue();
8046 }
8047
8048 // FIXME: 256-bit vector instructions don't require strict alignment;
8049 // improve this code to support it better.
8050 unsigned RequiredAlign = VT.getSizeInBits()/8;
8051 SDValue Chain = LD->getChain();
8052 // Make sure the stack object alignment is at least 16 or 32.
8053 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8054 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
8055 if (MFI.isFixedObjectIndex(FI)) {
8056 // Can't change the alignment. FIXME: It's possible to compute
8057 // the exact stack offset and reference FI + adjust offset instead.
8058 // If someone *really* cares about this, that's the way to implement it.
8059 return SDValue();
8060 } else {
8061 MFI.setObjectAlignment(FI, RequiredAlign);
8062 }
8063 }
8064
8065 // (Offset % 16 or 32) must be a multiple of 4. The address is then
8066 // Ptr + (Offset & ~15).
8067 if (Offset < 0)
8068 return SDValue();
8069 if ((Offset % RequiredAlign) & 3)
8070 return SDValue();
8071 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
8072 if (StartOffset) {
8073 SDLoc DL(Ptr);
8074 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8075 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
8076 }
8077
8078 int EltNo = (Offset - StartOffset) >> 2;
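// Illustrative arithmetic (not from the original source): for a v4i32 result
// (RequiredAlign = 16) and Offset = 20, (20 % 16) & 3 == 0, StartOffset = 16 and
// EltNo = (20 - 16) >> 2 = 1, so the widened load starts at Ptr + 16 and the
// splat mask selects element 1.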
8079 unsigned NumElems = VT.getVectorNumElements();
8080
8081 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
8082 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
8083 LD->getPointerInfo().getWithOffset(StartOffset));
8084
8085 SmallVector<int, 8> Mask(NumElems, EltNo);
8086
8087 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
8088 }
8089
8090 return SDValue();
8091}
8092
8093// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
8094static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
8095 if (ISD::isNON_EXTLoad(Elt.getNode())) {
8096 auto *BaseLd = cast<LoadSDNode>(Elt);
8097 if (!BaseLd->isSimple())
8098 return false;
8099 Ld = BaseLd;
8100 ByteOffset = 0;
8101 return true;
8102 }
8103
8104 switch (Elt.getOpcode()) {
8105 case ISD::BITCAST:
8106 case ISD::TRUNCATE:
8107 case ISD::SCALAR_TO_VECTOR:
8108 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
8109 case ISD::SRL:
8110 if (isa<ConstantSDNode>(Elt.getOperand(1))) {
8111 uint64_t Idx = Elt.getConstantOperandVal(1);
8112 if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
8113 ByteOffset += Idx / 8;
8114 return true;
8115 }
8116 }
8117 break;
8118 case ISD::EXTRACT_VECTOR_ELT:
8119 if (isa<ConstantSDNode>(Elt.getOperand(1))) {
8120 SDValue Src = Elt.getOperand(0);
8121 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
8122 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
8123 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
8124 findEltLoadSrc(Src, Ld, ByteOffset)) {
8125 uint64_t Idx = Elt.getConstantOperandVal(1);
8126 ByteOffset += Idx * (SrcSizeInBits / 8);
8127 return true;
8128 }
8129 }
8130 break;
8131 }
8132
8133 return false;
8134}
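// Illustrative example (not from the original source): for
// Elt = (trunc i32 (srl (load i64 %p), 32)) the recursion reaches the i64 load
// with ByteOffset = 0 and the SRL case adds 32 / 8, so the function returns true
// with Ld = the i64 load and ByteOffset = 4.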
8135
8136/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
8137/// elements can be replaced by a single large load which has the same value as
8138/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
8139///
8140/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
8141static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
8142 const SDLoc &DL, SelectionDAG &DAG,
8143 const X86Subtarget &Subtarget,
8144 bool isAfterLegalize) {
8145 if ((VT.getScalarSizeInBits() % 8) != 0)
8146 return SDValue();
8147
8148 unsigned NumElems = Elts.size();
8149
8150 int LastLoadedElt = -1;
8151 APInt LoadMask = APInt::getNullValue(NumElems);
8152 APInt ZeroMask = APInt::getNullValue(NumElems);
8153 APInt UndefMask = APInt::getNullValue(NumElems);
8154
8155 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
8156 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
8157
8158 // For each element in the initializer, see if we've found a load, zero or an
8159 // undef.
8160 for (unsigned i = 0; i < NumElems; ++i) {
8161 SDValue Elt = peekThroughBitcasts(Elts[i]);
8162 if (!Elt.getNode())
8163 return SDValue();
8164 if (Elt.isUndef()) {
8165 UndefMask.setBit(i);
8166 continue;
8167 }
8168 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
8169 ZeroMask.setBit(i);
8170 continue;
8171 }
8172
8173 // Each loaded element must be the correct fractional portion of the
8174 // requested vector load.
8175 unsigned EltSizeInBits = Elt.getValueSizeInBits();
8176 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
8177 return SDValue();
8178
8179 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
8180 return SDValue();
8181 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
8182 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
8183 return SDValue();
8184
8185 LoadMask.setBit(i);
8186 LastLoadedElt = i;
8187 }
8188 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
8189         LoadMask.countPopulation()) == NumElems &&
8190        "Incomplete element masks");
8191
8192 // Handle Special Cases - all undef or undef/zero.
8193 if (UndefMask.countPopulation() == NumElems)
8194 return DAG.getUNDEF(VT);
8195
8196 // FIXME: Should we return this as a BUILD_VECTOR instead?
8197 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
8198 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
8199 : DAG.getConstantFP(0.0, DL, VT);
8200
8201 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8202 int FirstLoadedElt = LoadMask.countTrailingZeros();
8203 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
8204 EVT EltBaseVT = EltBase.getValueType();
8205 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
8206        "Register/Memory size mismatch");
8207 LoadSDNode *LDBase = Loads[FirstLoadedElt];
8208 assert(LDBase && "Did not find base load for merging consecutive loads");
8209 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
8210 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
8211 int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
8212 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
8213
8214 // TODO: Support offsetting the base load.
8215 if (ByteOffsets[FirstLoadedElt] != 0)
8216 return SDValue();
8217
8218 // Check to see if the element's load is consecutive to the base load
8219 // or offset from a previous (already checked) load.
8220 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
8221 LoadSDNode *Ld = Loads[EltIdx];
8222 int64_t ByteOffset = ByteOffsets[EltIdx];
8223 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
8224 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
8225 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
8226 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
8227 }
8228 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
8229 EltIdx - FirstLoadedElt);
8230 };
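// Illustrative example (not from the original source): with BaseSizeInBytes = 4,
// an element at EltIdx = 3 whose ByteOffset is 8 is accepted if it reads from the
// same load as element BaseIdx = 3 - 8/4 = 1, which must itself be a loaded
// element with ByteOffset 0.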
8231
8232 // Consecutive loads can contain UNDEFs but not ZERO elements.
8233 // Consecutive loads with UNDEF and ZERO elements require an
8234 // additional shuffle stage to clear the ZERO elements.
8235 bool IsConsecutiveLoad = true;
8236 bool IsConsecutiveLoadWithZeros = true;
8237 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
8238 if (LoadMask[i]) {
8239 if (!CheckConsecutiveLoad(LDBase, i)) {
8240 IsConsecutiveLoad = false;
8241 IsConsecutiveLoadWithZeros = false;
8242 break;
8243 }
8244 } else if (ZeroMask[i]) {
8245 IsConsecutiveLoad = false;
8246 }
8247 }
8248
8249 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
8250 auto MMOFlags = LDBase->getMemOperand()->getFlags();
8251 assert(LDBase->isSimple() &&
8252        "Cannot merge volatile or atomic loads.");
8253 SDValue NewLd =
8254 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
8255 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
8256 for (auto *LD : Loads)
8257 if (LD)
8258 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
8259 return NewLd;
8260 };
8261
8262 // Check if the base load is entirely dereferenceable.
8263 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
8264 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
8265
8266 // LOAD - all consecutive load/undefs (must start/end with a load or be
8267 // entirely dereferenceable). If we have found an entire vector of loads and
8268 // undefs, then return a large load of the entire vector width starting at the
8269 // base pointer. If the vector contains zeros, then attempt to shuffle those
8270 // elements.
8271 if (FirstLoadedElt == 0 &&
8272 (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
8273 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
8274 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
8275 return SDValue();
8276
8277 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
8278 // will lower to regular temporal loads and use the cache.
8279 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
8280 VT.is256BitVector() && !Subtarget.hasInt256())
8281 return SDValue();
8282
8283 if (NumElems == 1)
8284 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
8285
8286 if (!ZeroMask)
8287 return CreateLoad(VT, LDBase);
8288
8289 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
8290 // vector and a zero vector to clear out the zero elements.
8291 if (!isAfterLegalize && VT.isVector()) {
8292 unsigned NumMaskElts = VT.getVectorNumElements();
8293 if ((NumMaskElts % NumElems) == 0) {
8294 unsigned Scale = NumMaskElts / NumElems;
8295 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
8296 for (unsigned i = 0; i < NumElems; ++i) {
8297 if (UndefMask[i])
8298 continue;
8299 int Offset = ZeroMask[i] ? NumMaskElts : 0;
8300 for (unsigned j = 0; j != Scale; ++j)
8301 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
8302 }
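// Illustrative example (not from the original source): with 4 x i64 elements
// feeding a v8i32 result (Scale = 2), if only element 2 is zero the mask becomes
// {0, 1, 2, 3, 12, 13, 6, 7}, so lanes 4-5 are taken from the zero vector Z.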
8303 SDValue V = CreateLoad(VT, LDBase);
8304 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
8305 : DAG.getConstantFP(0.0, DL, VT);
8306 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
8307 }
8308 }
8309 }
8310
8311 // If the upper half of a ymm/zmm load is undef then just load the lower half.
8312 if (VT.is256BitVector() || VT.is512BitVector()) {
8313 unsigned HalfNumElems = NumElems / 2;
8314 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnesValue()) {
8315 EVT HalfVT =
8316 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
8317 SDValue HalfLD =
8318 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
8319 DAG, Subtarget, isAfterLegalize);
8320 if (HalfLD)
8321 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
8322 HalfLD, DAG.getIntPtrConstant(0, DL));
8323 }
8324 }
8325
8326 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
8327 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
8328 (LoadSizeInBits == 32 || LoadSizeInBits == 64) &&
8329 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
8330 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
8331 : MVT::getIntegerVT(LoadSizeInBits);
8332 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
8333 // Allow v4f32 on SSE1 only targets.
8334 // FIXME: Add more isel patterns so we can just use VT directly.
8335 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
8336 VecVT = MVT::v4f32;
8337 if (TLI.isTypeLegal(VecVT)) {
8338 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
8339 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
8340 SDValue ResNode =
8341 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
8342 LDBase->getPointerInfo(),
8343 LDBase->getAlignment(),
8344 MachineMemOperand::MOLoad);
8345 for (auto *LD : Loads)
8346 if (LD)
8347 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
8348 return DAG.getBitcast(VT, ResNode);
8349 }
8350 }
8351
8352 // BROADCAST - match the smallest possible repetition pattern, load that
8353 // scalar/subvector element and then broadcast to the entire vector.
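// Illustrative example (not from the original source): a v8i32 built from eight
// i32 loads with values a,b,a,b,a,b,a,b matches at SubElems = 2 (RepeatSize = 64);
// the pair is re-lowered as a single 64-bit load, broadcast as an i64 into v4i64
// and bitcast back to v8i32.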
8354 if (ZeroMask.isNullValue() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
8355 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
8356 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
8357 unsigned RepeatSize = SubElems * BaseSizeInBits;
8358 unsigned ScalarSize = std::min(RepeatSize, 64u);
8359 if (!Subtarget.hasAVX2() && ScalarSize < 32)
8360 continue;
8361
8362 bool Match = true;
8363 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
8364 for (unsigned i = 0; i != NumElems && Match; ++i) {
8365 if (!LoadMask[i])
8366 continue;
8367 SDValue Elt = peekThroughBitcasts(Elts[i]);
8368 if (RepeatedLoads[i % SubElems].isUndef())
8369 RepeatedLoads[i % SubElems] = Elt;
8370 else
8371 Match &= (RepeatedLoads[i % SubElems] == Elt);
8372 }
8373
8374 // We must have loads at both ends of the repetition.
8375 Match &= !RepeatedLoads.front().isUndef();
8376 Match &= !RepeatedLoads.back().isUndef();
8377 if (!Match)
8378 continue;
8379
8380 EVT RepeatVT =
8381 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
8382 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
8383 : EVT::getFloatingPointVT(ScalarSize);
8384 if (RepeatSize > ScalarSize)
8385 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
8386 RepeatSize / ScalarSize);
8387 EVT BroadcastVT =
8388 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
8389 VT.getSizeInBits() / ScalarSize);
8390 if (TLI.isTypeLegal(BroadcastVT)) {
8391 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
8392 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) {
8393 unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST
8394 : X86ISD::VBROADCAST;
8395 SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad);
8396 return DAG.getBitcast(VT, Broadcast);
8397 }
8398 }
8399 }
8400 }
8401
8402 return SDValue();
8403}
8404
8405// Combine vector ops (shuffles etc.) that are equal to build_vector load1,
8406// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
8407// are consecutive, non-overlapping, and in the right order.
8408static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
8409 SelectionDAG &DAG,
8410 const X86Subtarget &Subtarget,
8411 bool isAfterLegalize) {
8412 SmallVector<SDValue, 64> Elts;
8413 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
8414 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
8415 Elts.push_back(Elt);
8416 continue;
8417 }
8418 return SDValue();
8419 }
8420 assert(Elts.size() == VT.getVectorNumElements());
8421 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
8422 isAfterLegalize);
8423}
8424
8425static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
8426 unsigned SplatBitSize, LLVMContext &C) {
8427 unsigned ScalarSize = VT.getScalarSizeInBits();
8428 unsigned NumElm = SplatBitSize / ScalarSize;
8429
8430 SmallVector<Constant *, 32> ConstantVec;
8431 for (unsigned i = 0; i < NumElm; i++) {
8432 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
8433 Constant *Const;
8434 if (VT.isFloatingPoint()) {
8435 if (ScalarSize == 32) {
8436 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
8437 } else {
8438 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
8439 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
8440 }
8441 } else
8442 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
8443 ConstantVec.push_back(Const);
8444 }
8445 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
8446}
8447
8448static bool isFoldableUseOfShuffle(SDNode *N) {
8449 for (auto *U : N->uses()) {
8450 unsigned Opc = U->getOpcode();
8451 // VPERMV/VPERMV3 shuffles can never fold their index operands.
8452 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
8453 return false;
8454 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
8455 return false;
8456 if (isTargetShuffle(Opc))
8457 return true;
8458 if (Opc == ISD::BITCAST) // Ignore bitcasts
8459 return isFoldableUseOfShuffle(U);
8460 if (N->hasOneUse())
8461 return true;
8462 }
8463 return false;
8464}
8465
8466// Check if the current node of the build vector is a zero-extended vector.
8467// If so, return the extended value.
8468// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
8469// NumElt - return the number of zero-extended identical values.
8470// EltType - return the type of the value including the zero extension.
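// Illustrative example (not from the original source): a v16i8 build_vector
// (a,0,0,0, a,0,0,0, a,0,0,0, a,0,0,0) gives Delta = 4, so the function returns a
// with EltType = i32 and NumElt = 4, i.e. a splat of (zext i8 a to i32) over four
// 32-bit elements.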
8471static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
8472 unsigned &NumElt, MVT &EltType) {
8473 SDValue ExtValue = Op->getOperand(0);
8474 unsigned NumElts = Op->getNumOperands();
8475 unsigned Delta = NumElts;
8476
8477 for (unsigned i = 1; i < NumElts; i++) {
8478 if (Op->getOperand(i) == ExtValue) {
8479 Delta = i;
8480 break;
8481 }
8482 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
8483 return SDValue();
8484 }
8485 if (!isPowerOf2_32(Delta) || Delta == 1)
8486 return SDValue();
8487
8488 for (unsigned i = Delta; i < NumElts; i++) {
8489 if (i % Delta == 0) {
8490 if (Op->getOperand(i) != ExtValue)
8491 return SDValue();
8492 } else if (!(isNullConstant(Op->getOperand(i)) ||
8493 Op->getOperand(i).isUndef()))
8494 return SDValue();
8495 }
8496 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
8497 unsigned ExtVTSize = EltSize * Delta;
8498 EltType = MVT::getIntegerVT(ExtVTSize);
8499 NumElt = NumElts / Delta;
8500 return ExtValue;
8501}
8502
8503/// Attempt to use the vbroadcast instruction to generate a splat value
8504/// from a splat BUILD_VECTOR which uses:
8505/// a. A single scalar load, or a constant.
8506/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
8507///
8508/// The VBROADCAST node is returned when a pattern is found,
8509/// or SDValue() otherwise.
8510static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
8511 const X86Subtarget &Subtarget,
8512 SelectionDAG &DAG) {
8513 // VBROADCAST requires AVX.
8514 // TODO: Splats could be generated for non-AVX CPUs using SSE
8515 // instructions, but there's less potential gain for only 128-bit vectors.
8516 if (!Subtarget.hasAVX())
8517 return SDValue();
8518
8519 MVT VT = BVOp->getSimpleValueType(0);
8520 SDLoc dl(BVOp);
8521
8522 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
8523        "Unsupported vector type for broadcast.");
8524
8525 BitVector UndefElements;
8526 SDValue Ld = BVOp->getSplatValue(&UndefElements);
8527
8528 // Attempt to use VBROADCASTM
8529 // From this pattern:
8530 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
8531 // b. t1 = (build_vector t0 t0)
8532 //
8533 // Create (VBROADCASTM v2i1 X)
8534 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
8535 MVT EltType = VT.getScalarType();
8536 unsigned NumElts = VT.getVectorNumElements();
8537 SDValue BOperand;
8538 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
8539 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
8540 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
8541 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
8542 if (ZeroExtended)
8543 BOperand = ZeroExtended.getOperand(0);
8544 else
8545 BOperand = Ld.getOperand(0).getOperand(0);
8546 MVT MaskVT = BOperand.getSimpleValueType();
8547 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
8548 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
8549 SDValue Brdcst =
8550 DAG.getNode(X86ISD::VBROADCASTM, dl,
8551 MVT::getVectorVT(EltType, NumElts), BOperand);
8552 return DAG.getBitcast(VT, Brdcst);
8553 }
8554 }
8555 }
8556
8557 unsigned NumElts = VT.getVectorNumElements();
8558 unsigned NumUndefElts = UndefElements.count();
8559 if (!Ld || (NumElts - NumUndefElts) <= 1) {
8560 APInt SplatValue, Undef;
8561 unsigned SplatBitSize;
8562 bool HasUndef;
8563 // Check if this is a repeated constant pattern suitable for broadcasting.
8564 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
8565 SplatBitSize > VT.getScalarSizeInBits() &&
8566 SplatBitSize < VT.getSizeInBits()) {
8567 // Avoid replacing with a broadcast when the value is used by a shuffle
8568 // instruction, to preserve the existing custom lowering of shuffles.
8569 if (isFoldableUseOfShuffle(BVOp))
8570 return SDValue();
8571 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
8572 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8573 LLVMContext *Ctx = DAG.getContext();
8574 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
8575 if (Subtarget.hasAVX()) {
8576 if (SplatBitSize == 32 || SplatBitSize == 64 ||
8577 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
8578 // Splatted value can fit in one INTEGER constant in constant pool.
8579 // Load the constant and broadcast it.
8580 MVT CVT = MVT::getIntegerVT(SplatBitSize);
8581 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
8582 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
8583 SDValue CP = DAG.getConstantPool(C, PVT);
8584 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
8585
8586 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8587 SDVTList Tys =
8588 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
8589 SDValue Ops[] = {DAG.getEntryNode(), CP};
8590 MachinePointerInfo MPI =
8591 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8592 SDValue Brdcst = DAG.getMemIntrinsicNode(
8593 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
8594 MachineMemOperand::MOLoad);
8595 return DAG.getBitcast(VT, Brdcst);
8596 }
8597 if (SplatBitSize > 64) {
8598 // Load the vector of constants and broadcast it.
8599 MVT CVT = VT.getScalarType();
8600 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
8601 *Ctx);
8602 SDValue VCP = DAG.getConstantPool(VecC, PVT);
8603 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
8604 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
8605 Ld = DAG.getLoad(
8606 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
8607 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
8608 Alignment);
8609 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
8610 return DAG.getBitcast(VT, Brdcst);
8611 }
8612 }
8613 }
8614
8615 // If we are moving a scalar into a vector (Ld must be set and all elements
8616 // but 1 are undef) and that operation is not obviously supported by
8617 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
8618 // That's better than general shuffling and may eliminate a load to GPR and
8619 // move from scalar to vector register.
8620 if (!Ld || NumElts - NumUndefElts != 1)
8621 return SDValue();
8622 unsigned ScalarSize = Ld.getValueSizeInBits();
8623 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
8624 return SDValue();
8625 }
8626
8627 bool ConstSplatVal =
8628 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
8629 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
8630
8631 // Make sure that all of the users of a non-constant load are from the
8632 // BUILD_VECTOR node.
8633 // FIXME: Is the use count needed for non-constant, non-load case?
8634 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
8635 return SDValue();
8636
8637 unsigned ScalarSize = Ld.getValueSizeInBits();
8638 bool IsGE256 = (VT.getSizeInBits() >= 256);
8639
8640 // When optimizing for size, generate up to 5 extra bytes for a broadcast
8641 // instruction to save 8 or more bytes of constant pool data.
8642 // TODO: If multiple splats are generated to load the same constant,
8643 // it may be detrimental to overall size. There needs to be a way to detect
8644 // that condition to know if this is truly a size win.
8645 bool OptForSize = DAG.shouldOptForSize();
8646
8647 // Handle broadcasting a single constant scalar from the constant pool
8648 // into a vector.
8649 // On Sandybridge (no AVX2), it is still better to load a constant vector
8650 // from the constant pool and not to broadcast it from a scalar.
8651 // But override that restriction when optimizing for size.
8652 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
8653 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
8654 EVT CVT = Ld.getValueType();
8655 assert(!CVT.isVector() && "Must not broadcast a vector type");
8656
8657 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
8658 // For size optimization, also splat v2f64 and v2i64, and for size opt
8659 // with AVX2, also splat i8 and i16.
8660 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
8661 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8662 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
8663 const Constant *C = nullptr;
8664 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
8665 C = CI->getConstantIntValue();
8666 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
8667 C = CF->getConstantFPValue();
8668
8669 assert(C && "Invalid constant type");
8670
8671 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8672 SDValue CP =
8673 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
8674 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
8675
8676 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8677 SDValue Ops[] = {DAG.getEntryNode(), CP};
8678 MachinePointerInfo MPI =
8679 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8680 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
8681 MPI, Alignment, MachineMemOperand::MOLoad);
8682 }
8683 }
8684
8685 // Handle AVX2 in-register broadcasts.
8686 if (!IsLoad && Subtarget.hasInt256() &&
8687 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
8688 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
8689
8690 // The scalar source must be a normal load.
8691 if (!IsLoad)
8692 return SDValue();
8693
8694 // Make sure the non-chain result is only used by this build vector.
8695 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
8696 return SDValue();
8697
8698 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
8699 (Subtarget.hasVLX() && ScalarSize == 64)) {
8700 auto *LN = cast<LoadSDNode>(Ld);
8701 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8702 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8703 SDValue BCast =
8704 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8705 LN->getMemoryVT(), LN->getMemOperand());
8706 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8707 return BCast;
8708 }
8709
8710 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
8711 // match double, since there is no vbroadcastsd xmm instruction.
8712 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
8713 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
8714 auto *LN = cast<LoadSDNode>(Ld);
8715 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8716 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
8717 SDValue BCast =
8718 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
8719 LN->getMemoryVT(), LN->getMemOperand());
8720 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
8721 return BCast;
8722 }
8723
8724 // Unsupported broadcast.
8725 return SDValue();
8726}
8727
8728/// For an EXTRACT_VECTOR_ELT with a constant index return the real
8729/// underlying vector and index.
8730///
8731/// Modifies \p ExtractedFromVec to the real vector and returns the real
8732/// index.
8733static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
8734 SDValue ExtIdx) {
8735 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
8736 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
8737 return Idx;
8738
8739 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
8740 // lowered this:
8741 // (extract_vector_elt (v8f32 %1), Constant<6>)
8742 // to:
8743 // (extract_vector_elt (vector_shuffle<2,u,u,u>
8744 // (extract_subvector (v8f32 %0), Constant<4>),
8745 // undef)
8746 // Constant<0>)
8747 // In this case the vector is the extract_subvector expression and the index
8748 // is 2, as specified by the shuffle.
8749 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
8750 SDValue ShuffleVec = SVOp->getOperand(0);
8751 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
8752 assert(ShuffleVecVT.getVectorElementType() ==
8753        ExtractedFromVec.getSimpleValueType().getVectorElementType());
8754
8755 int ShuffleIdx = SVOp->getMaskElt(Idx);
8756 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
8757 ExtractedFromVec = ShuffleVec;
8758 return ShuffleIdx;
8759 }
8760 return Idx;
8761}
8762
8763static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
8764 MVT VT = Op.getSimpleValueType();
8765
8766 // Skip if insert_vec_elt is not supported.
8767 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8768 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
8769 return SDValue();
8770
8771 SDLoc DL(Op);
8772 unsigned NumElems = Op.getNumOperands();
8773
8774 SDValue VecIn1;
8775 SDValue VecIn2;
8776 SmallVector<unsigned, 4> InsertIndices;
8777 SmallVector<int, 8> Mask(NumElems, -1);
8778
8779 for (unsigned i = 0; i != NumElems; ++i) {
8780 unsigned Opc = Op.getOperand(i).getOpcode();
8781
8782 if (Opc == ISD::UNDEF)
8783 continue;
8784
8785 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
8786 // Quit if more than 1 element needs inserting.
8787 if (InsertIndices.size() > 1)
8788 return SDValue();
8789
8790 InsertIndices.push_back(i);
8791 continue;
8792 }
8793
8794 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
8795 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
8796
8797 // Quit if non-constant index.
8798 if (!isa<ConstantSDNode>(ExtIdx))
8799 return SDValue();
8800 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
8801
8802 // Quit if extracted from vector of different type.
8803 if (ExtractedFromVec.getValueType() != VT)
8804 return SDValue();
8805
8806 if (!VecIn1.getNode())
8807 VecIn1 = ExtractedFromVec;
8808 else if (VecIn1 != ExtractedFromVec) {
8809 if (!VecIn2.getNode())
8810 VecIn2 = ExtractedFromVec;
8811 else if (VecIn2 != ExtractedFromVec)
8812 // Quit if more than 2 vectors to shuffle
8813 return SDValue();
8814 }
8815
8816 if (ExtractedFromVec == VecIn1)
8817 Mask[i] = Idx;
8818 else if (ExtractedFromVec == VecIn2)
8819 Mask[i] = Idx + NumElems;
8820 }
8821
8822 if (!VecIn1.getNode())
8823 return SDValue();
8824
8825 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
8826 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
8827
8828 for (unsigned Idx : InsertIndices)
8829 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
8830 DAG.getIntPtrConstant(Idx, DL));
8831
8832 return NV;
8833}
8834
8835// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8836static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
8837 const X86Subtarget &Subtarget) {
8838
8839 MVT VT = Op.getSimpleValueType();
8840 assert((VT.getVectorElementType() == MVT::i1) &&
8841        "Unexpected type in LowerBUILD_VECTORvXi1!");
8842
8843 SDLoc dl(Op);
8844 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8845 ISD::isBuildVectorAllOnes(Op.getNode()))
8846 return Op;
8847
8848 uint64_t Immediate = 0;
8849 SmallVector<unsigned, 16> NonConstIdx;
8850 bool IsSplat = true;
8851 bool HasConstElts = false;
8852 int SplatIdx = -1;
8853 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8854 SDValue In = Op.getOperand(idx);
8855 if (In.isUndef())
8856 continue;
8857 if (!isa<ConstantSDNode>(In))
8858 NonConstIdx.push_back(idx);
8859 else {
8860 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
8861 HasConstElts = true;
8862 }
8863 if (SplatIdx < 0)
8864 SplatIdx = idx;
8865 else if (In != Op.getOperand(SplatIdx))
8866 IsSplat = false;
8867 }
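// Illustrative example (not from the original source): for a v8i1 build_vector
// (1, 0, %x, 1, undef, 0, 1, 1) with non-constant %x, the loop above collects
// Immediate = 0b11001001 and NonConstIdx = {2}; the constant part is later
// materialized as an i8 immediate bitcast to v8i1 and element 2 is inserted
// afterwards with INSERT_VECTOR_ELT.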
8868
8869 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
8870 if (IsSplat) {
8871 // The build_vector allows the scalar element to be larger than the vector
8872 // element type. We need to mask it to use as a condition unless we know
8873 // the upper bits are zero.
8874 // FIXME: Use computeKnownBits instead of checking specific opcode?
8875 SDValue Cond = Op.getOperand(SplatIdx);
8876 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8877 if (Cond.getOpcode() != ISD::SETCC)
8878 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8879 DAG.getConstant(1, dl, MVT::i8));
8880
8881 // Perform the select in the scalar domain so we can use cmov.
8882 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8883 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8884 DAG.getAllOnesConstant(dl, MVT::i32),
8885 DAG.getConstant(0, dl, MVT::i32));
8886 Select = DAG.getBitcast(MVT::v32i1, Select);
8887 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8888 } else {
8889 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8890 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8891 DAG.getAllOnesConstant(dl, ImmVT),
8892 DAG.getConstant(0, dl, ImmVT));
8893 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8894 Select = DAG.getBitcast(VecVT, Select);
8895 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8896 DAG.getIntPtrConstant(0, dl));
8897 }
8898 }
8899
8900 // Insert elements one by one.
8901 SDValue DstVec;
8902 if (HasConstElts) {
8903 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8904 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8905 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8906 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8907 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8908 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8909 } else {
8910 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8911 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8912 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8913 DstVec = DAG.getBitcast(VecVT, Imm);
8914 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8915 DAG.getIntPtrConstant(0, dl));
8916 }
8917 } else
8918 DstVec = DAG.getUNDEF(VT);
8919
8920 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
8921 unsigned InsertIdx = NonConstIdx[i];
8922 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8923 Op.getOperand(InsertIdx),
8924 DAG.getIntPtrConstant(InsertIdx, dl));
8925 }
8926 return DstVec;
8927}
8928
8929/// This is a helper function of LowerToHorizontalOp().
8930/// This function checks that the input build_vector \p N implements a
8931/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8932/// may not match the layout of an x86 256-bit horizontal instruction.
8933/// In other words, if this returns true, then some extraction/insertion will
8934/// be required to produce a valid horizontal instruction.
8935///
8936/// Parameter \p Opcode defines the kind of horizontal operation to match.
8937/// For example, if \p Opcode is equal to ISD::ADD, then this function
8938/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8939/// is equal to ISD::SUB, then this function checks if this is a horizontal
8940/// arithmetic sub.
8941///
8942/// This function only analyzes elements of \p N whose indices are
8943/// in range [BaseIdx, LastIdx).
8944///
8945/// TODO: This function was originally used to match both real and fake partial
8946/// horizontal operations, but the index-matching logic is incorrect for that.
8947/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8948/// code because it is only used for partial h-op matching now?
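/// Illustrative example (not from the original source): for a v8i32
/// build_vector with BaseIdx = 0 and LastIdx = 4, a matching input looks like
///   elt0 = add (extract V0, 0), (extract V0, 1)
///   elt1 = add (extract V0, 2), (extract V0, 3)
///   elt2 = add (extract V1, 0), (extract V1, 1)
///   elt3 = add (extract V1, 2), (extract V1, 3)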
8949static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8950 SelectionDAG &DAG,
8951 unsigned BaseIdx, unsigned LastIdx,
8952 SDValue &V0, SDValue &V1) {
8953 EVT VT = N->getValueType(0);
8954 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8955 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8956 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8957        "Invalid Vector in input!");
8958
8959 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8960 bool CanFold = true;
8961 unsigned ExpectedVExtractIdx = BaseIdx;
8962 unsigned NumElts = LastIdx - BaseIdx;
8963 V0 = DAG.getUNDEF(VT);
8964 V1 = DAG.getUNDEF(VT);
8965
8966 // Check if N implements a horizontal binop.
8967 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8968 SDValue Op = N->getOperand(i + BaseIdx);
8969
8970 // Skip UNDEFs.
8971 if (Op->isUndef()) {
8972 // Update the expected vector extract index.
8973 if (i * 2 == NumElts)
8974 ExpectedVExtractIdx = BaseIdx;
8975 ExpectedVExtractIdx += 2;
8976 continue;
8977 }
8978
8979 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8980
8981 if (!CanFold)
8982 break;
8983
8984 SDValue Op0 = Op.getOperand(0);
8985 SDValue Op1 = Op.getOperand(1);
8986
8987 // Try to match the following pattern:
8988 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8989 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8990 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8991 Op0.getOperand(0) == Op1.getOperand(0) &&
8992 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8993 isa<ConstantSDNode>(Op1.getOperand(1)));
8994 if (!CanFold)
8995 break;
8996
8997 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
8998 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
8999
9000 if (i * 2 < NumElts) {
9001 if (V0.isUndef()) {
9002 V0 = Op0.getOperand(0);
9003 if (V0.getValueType() != VT)
9004 return false;
9005 }
9006 } else {
9007 if (V1.isUndef()) {
9008 V1 = Op0.getOperand(0);
9009 if (V1.getValueType() != VT)
9010 return false;
9011 }
9012 if (i * 2 == NumElts)
9013 ExpectedVExtractIdx = BaseIdx;
9014 }
9015
9016 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
9017 if (I0 == ExpectedVExtractIdx)
9018 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
9019 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
9020 // Try to match the following dag sequence:
9021 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
9022 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
9023 } else
9024 CanFold = false;
9025
9026 ExpectedVExtractIdx += 2;
9027 }
9028
9029 return CanFold;
9030}
9031
9032/// Emit a sequence of two 128-bit horizontal add/sub followed by
9033/// a concat_vector.
9034///
9035/// This is a helper function of LowerToHorizontalOp().
9036/// This function expects two 256-bit vectors called V0 and V1.
9037/// At first, each vector is split into two separate 128-bit vectors.
9038/// Then, the resulting 128-bit vectors are used to implement two
9039/// horizontal binary operations.
9040///
9041/// The kind of horizontal binary operation is defined by \p X86Opcode.
9042///
9043/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
9044/// the two new horizontal binops.
9045/// When Mode is set, the first horizontal binop dag node takes as input the
9046/// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
9047/// binop dag node takes as input the lower 128 bits of V1 and the upper
9048/// 128 bits of V1.
9049/// Example:
9050/// HADD V0_LO, V0_HI
9051/// HADD V1_LO, V1_HI
9052///
9053/// Otherwise, the first horizontal binop dag node takes as input the lower
9054/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
9055/// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
9056/// Example:
9057/// HADD V0_LO, V1_LO
9058/// HADD V0_HI, V1_HI
9059///
9060/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
9061/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
9062/// the upper 128-bits of the result.
9063static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
9064 const SDLoc &DL, SelectionDAG &DAG,
9065 unsigned X86Opcode, bool Mode,
9066 bool isUndefLO, bool isUndefHI) {
9067 MVT VT = V0.getSimpleValueType();
9068 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
9069        "Invalid nodes in input!");
9070
9071 unsigned NumElts = VT.getVectorNumElements();
9072 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
9073 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
9074 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
9075 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
9076 MVT NewVT = V0_LO.getSimpleValueType();
9077
9078 SDValue LO = DAG.getUNDEF(NewVT);
9079 SDValue HI = DAG.getUNDEF(NewVT);
9080
9081 if (Mode) {
9082 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9083 if (!isUndefLO && !V0->isUndef())
9084 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
9085 if (!isUndefHI && !V1->isUndef())
9086 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
9087 } else {
9088 // Don't emit a horizontal binop if the result is expected to be UNDEF.
9089 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
9090 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
9091
9092 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
9093 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
9094 }
9095
9096 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
9097}
9098
9099/// Returns true iff \p BV builds a vector with the result equivalent to
9100/// the result of an ADDSUB/SUBADD operation.
9101/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
9102/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
9103/// \p Opnd0 and \p Opnd1.
9104static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
9105 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9106 SDValue &Opnd0, SDValue &Opnd1,
9107 unsigned &NumExtracts,
9108 bool &IsSubAdd) {
9109
9110 MVT VT = BV->getSimpleValueType(0);
9111 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
9112 return false;
9113
9114 unsigned NumElts = VT.getVectorNumElements();
9115 SDValue InVec0 = DAG.getUNDEF(VT);
9116 SDValue InVec1 = DAG.getUNDEF(VT);
9117
9118 NumExtracts = 0;
9119
9120 // Odd-numbered elements in the input build vector are obtained from
9121 // adding/subtracting two integer/float elements.
9122 // Even-numbered elements in the input build vector are obtained from
9123 // subtracting/adding two integer/float elements.
9124 unsigned Opc[2] = {0, 0};
9125 for (unsigned i = 0, e = NumElts; i != e; ++i) {
9126 SDValue Op = BV->getOperand(i);
9127
9128 // Skip 'undef' values.
9129 unsigned Opcode = Op.getOpcode();
9130 if (Opcode == ISD::UNDEF)
9131 continue;
9132
9133 // Early exit if we found an unexpected opcode.
9134 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
9135 return false;
9136
9137 SDValue Op0 = Op.getOperand(0);
9138 SDValue Op1 = Op.getOperand(1);
9139
9140 // Try to match the following pattern:
9141 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
9142 // Early exit if we cannot match that sequence.
9143 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9144 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9145 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9146 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
9147 Op0.getOperand(1) != Op1.getOperand(1))
9148 return false;
9149
9150 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
9151 if (I0 != i)
9152 return false;
9153
9154 // We found a valid add/sub node, make sure it's the same opcode as previous
9155 // elements for this parity.
9156 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
9157 return false;
9158 Opc[i % 2] = Opcode;
9159
9160 // Update InVec0 and InVec1.
9161 if (InVec0.isUndef()) {
9162 InVec0 = Op0.getOperand(0);
9163 if (InVec0.getSimpleValueType() != VT)
9164 return false;
9165 }
9166 if (InVec1.isUndef()) {
9167 InVec1 = Op1.getOperand(0);
9168 if (InVec1.getSimpleValueType() != VT)
9169 return false;
9170 }
9171
9172 // Make sure that operands in input to each add/sub node always
9173 // come from the same pair of vectors.
9174 if (InVec0 != Op0.getOperand(0)) {
9175 if (Opcode == ISD::FSUB)
9176 return false;
9177
9178 // FADD is commutable. Try to commute the operands
9179 // and then test again.
9180 std::swap(Op0, Op1);
9181 if (InVec0 != Op0.getOperand(0))
9182 return false;
9183 }
9184
9185 if (InVec1 != Op1.getOperand(0))
9186 return false;
9187
9188 // Increment the number of extractions done.
9189 ++NumExtracts;
9190 }
9191
9192 // Ensure we have found an opcode for both parities and that they are
9193 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
9194 // inputs are undef.
9195 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
9196 InVec0.isUndef() || InVec1.isUndef())
9197 return false;
9198
9199 IsSubAdd = Opc[0] == ISD::FADD;
9200
9201 Opnd0 = InVec0;
9202 Opnd1 = InVec1;
9203 return true;
9204}
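// --- Illustrative sketch (not part of the original source) ---
// Scalar reference for the two element patterns isAddSubOrSubAdd recognizes,
// shown for 4 float lanes; RefAddSub and RefSubAdd are hypothetical helpers.
static void RefAddSub(const float A[4], const float B[4], float Out[4]) {
  // X86ISD::ADDSUB (addsubps): even lanes subtract, odd lanes add.
  for (int i = 0; i != 4; ++i)
    Out[i] = (i % 2) ? A[i] + B[i] : A[i] - B[i];
}
static void RefSubAdd(const float A[4], const float B[4], float Out[4]) {
  // SUBADD (IsSubAdd == true above): even lanes add, odd lanes subtract.
  for (int i = 0; i != 4; ++i)
    Out[i] = (i % 2) ? A[i] - B[i] : A[i] + B[i];
}
// --- end sketch ---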
9205
9206/// Returns true if it is possible to fold MUL and an idiom that has already been
9207/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
9208/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
9209/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
9210///
9211/// Prior to calling this function it should be known that there is some
9212/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
9213/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
9214/// before replacement of such SDNode with ADDSUB operation. Thus the number
9215/// of \p Opnd0 uses is expected to be equal to 2.
9216/// For example, this function may be called for the following IR:
9217/// %AB = fmul fast <2 x double> %A, %B
9218/// %Sub = fsub fast <2 x double> %AB, %C
9219/// %Add = fadd fast <2 x double> %AB, %C
9220/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
9221/// <2 x i32> <i32 0, i32 3>
9222/// There is a def for %Addsub here, which potentially can be replaced by
9223/// X86ISD::ADDSUB operation:
9224/// %Addsub = X86ISD::ADDSUB %AB, %C
9225/// and such ADDSUB can further be replaced with FMADDSUB:
9226/// %Addsub = FMADDSUB %A, %B, %C.
9227///
9228/// The main reason why this method is called before the replacement of the
9229/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
9230/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
9231/// FMADDSUB is.
9232static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
9233 SelectionDAG &DAG,
9234 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
9235 unsigned ExpectedUses) {
9236 if (Opnd0.getOpcode() != ISD::FMUL ||
9237 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
9238 return false;
9239
9240 // FIXME: These checks must match the similar ones in
9241 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
9242 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
9243 // or MUL + ADDSUB to FMADDSUB.
9244 const TargetOptions &Options = DAG.getTarget().Options;
9245 bool AllowFusion =
9246 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
9247 if (!AllowFusion)
9248 return false;
9249
9250 Opnd2 = Opnd1;
9251 Opnd1 = Opnd0.getOperand(1);
9252 Opnd0 = Opnd0.getOperand(0);
9253
9254 return true;
9255}
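// --- Illustrative sketch (not part of the original source) ---
// Scalar reference for the fused forms selected once the FMUL feeding the
// recognized ADDSUB/SUBADD idiom is folded (ignoring the single-rounding
// difference of a real FMA); RefFMAddSub and RefFMSubAdd are hypothetical
// helpers.
static void RefFMAddSub(const float A[4], const float B[4], const float C[4],
                        float Out[4]) {
  for (int i = 0; i != 4; ++i)
    Out[i] = (i % 2) ? A[i] * B[i] + C[i] : A[i] * B[i] - C[i];
}
static void RefFMSubAdd(const float A[4], const float B[4], const float C[4],
                        float Out[4]) {
  for (int i = 0; i != 4; ++i)
    Out[i] = (i % 2) ? A[i] * B[i] - C[i] : A[i] * B[i] + C[i];
}
// --- end sketch ---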
9256
9257/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
9258/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
9259/// X86ISD::FMSUBADD node.
9260static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
9261 const X86Subtarget &Subtarget,
9262 SelectionDAG &DAG) {
9263 SDValue Opnd0, Opnd1;
9264 unsigned NumExtracts;
9265 bool IsSubAdd;
9266 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
9267 IsSubAdd))
9268 return SDValue();
9269
9270 MVT VT = BV->getSimpleValueType(0);
9271 SDLoc DL(BV);
9272
9273 // Try to generate X86ISD::FMADDSUB node here.
9274 SDValue Opnd2;
9275 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
9276 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
9277 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
9278 }
9279
9280 // We only support ADDSUB.
9281 if (IsSubAdd)
9282 return SDValue();
9283
9284 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
9285 // the ADDSUB idiom has been successfully recognized. There are no known
9286 // X86 targets with 512-bit ADDSUB instructions!
9287 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
9288 // recognition.
9289 if (VT.is512BitVector())
9290 return SDValue();
9291
9292 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
9293}
9294
9295static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
9296 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
9297 // Initialize outputs to known values.
9298 MVT VT = BV->getSimpleValueType(0);
9299 HOpcode = ISD::DELETED_NODE;
9300 V0 = DAG.getUNDEF(VT);
9301 V1 = DAG.getUNDEF(VT);
9302
9303 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
9304 // half of the result is calculated independently from the 128-bit halves of
9305 // the inputs, so that makes the index-checking logic below more complicated.
9306 unsigned NumElts = VT.getVectorNumElements();
9307 unsigned GenericOpcode = ISD::DELETED_NODE;
9308 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
9309 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
9310 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
9311 for (unsigned i = 0; i != Num128BitChunks; ++i) {
9312 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
9313 // Ignore undef elements.
9314 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
9315 if (Op.isUndef())
9316 continue;
9317
9318 // If there's an opcode mismatch, we're done.
9319 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
9320 return false;
9321
9322 // Initialize horizontal opcode.
9323 if (HOpcode == ISD::DELETED_NODE) {
9324 GenericOpcode = Op.getOpcode();
9325 switch (GenericOpcode) {
9326 case ISD::ADD: HOpcode = X86ISD::HADD; break;
9327 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
9328 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
9329 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
9330 default: return false;
9331 }
9332 }
9333
9334 SDValue Op0 = Op.getOperand(0);
9335 SDValue Op1 = Op.getOperand(1);
9336 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9337 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9338 Op0.getOperand(0) != Op1.getOperand(0) ||
9339 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
9340 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
9341 return false;
9342
9343 // The source vector is chosen based on which 64-bit half of the
9344 // destination vector is being calculated.
9345 if (j < NumEltsIn64Bits) {
9346 if (V0.isUndef())
9347 V0 = Op0.getOperand(0);
9348 } else {
9349 if (V1.isUndef())
9350 V1 = Op0.getOperand(0);
9351 }
9352
9353 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
9354 if (SourceVec != Op0.getOperand(0))
9355 return false;
9356
9357 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
9358 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
9359 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
9360 unsigned ExpectedIndex = i * NumEltsIn128Bits +
9361 (j % NumEltsIn64Bits) * 2;
9362 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
9363 continue;
9364
9365 // If this is not a commutative op, this does not match.
9366 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
9367 return false;
9368
9369 // Addition is commutative, so try swapping the extract indexes.
9370 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
9371 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
9372 continue;
9373
9374 // Extract indexes do not match horizontal requirement.
9375 return false;
9376 }
9377 }
9378 // We matched. Opcode and operands are returned by reference as arguments.
9379 return true;
9380}
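// Worked example (not part of the original source): for a v8i32 build_vector
// matched above as X86ISD::HADD of V0 and V1, the expected extract indices per
// result element are (chunk i, element j within the chunk):
//   Out[0] = V0[0]+V0[1]   Out[1] = V0[2]+V0[3]   // i=0, j=0..1 -> V0
//   Out[2] = V1[0]+V1[1]   Out[3] = V1[2]+V1[3]   // i=0, j=2..3 -> V1
//   Out[4] = V0[4]+V0[5]   Out[5] = V0[6]+V0[7]   // i=1, j=0..1 -> V0
//   Out[6] = V1[4]+V1[5]   Out[7] = V1[6]+V1[7]   // i=1, j=2..3 -> V1
// matching ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2.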
9381
9382static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
9383 SelectionDAG &DAG, unsigned HOpcode,
9384 SDValue V0, SDValue V1) {
9385 // If either input vector is not the same size as the build vector,
9386 // extract/insert the low bits to the correct size.
9387 // This is free (examples: zmm --> xmm, xmm --> ymm).
9388 MVT VT = BV->getSimpleValueType(0);
9389 unsigned Width = VT.getSizeInBits();
9390 if (V0.getValueSizeInBits() > Width)
9391 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
9392 else if (V0.getValueSizeInBits() < Width)
9393 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
9394
9395 if (V1.getValueSizeInBits() > Width)
9396 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
9397 else if (V1.getValueSizeInBits() < Width)
9398 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
9399
9400 unsigned NumElts = VT.getVectorNumElements();
9401 APInt DemandedElts = APInt::getAllOnesValue(NumElts);
9402 for (unsigned i = 0; i != NumElts; ++i)
9403 if (BV->getOperand(i).isUndef())
9404 DemandedElts.clearBit(i);
9405
9406 // If we don't need the upper xmm, then perform as an xmm hop.
9407 unsigned HalfNumElts = NumElts / 2;
9408 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
9409 MVT HalfVT = VT.getHalfNumVectorElementsVT();
9410 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
9411 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
9412 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
9413 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
9414 }
9415
9416 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
9417}
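// Example (not part of the original source): if the build_vector is v4i32 but
// the matched sources V0/V1 are 256-bit values, the low 128 bits of each are
// extracted above; if the sources are narrower than the build_vector, they are
// widened with undef upper bits. And when only the low half of a 256-bit
// result is demanded, a single 128-bit hop is emitted and re-inserted into an
// undef 256-bit vector.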
9418
9419/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
9420static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
9421 const X86Subtarget &Subtarget,
9422 SelectionDAG &DAG) {
9423 // We need at least 2 non-undef elements to make this worthwhile by default.
9424 unsigned NumNonUndefs =
9425 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
9426 if (NumNonUndefs < 2)
9427 return SDValue();
9428
9429 // There are 4 sets of horizontal math operations distinguished by type:
9430 // int/FP at 128-bit/256-bit. Each type was introduced with a different
9431 // subtarget feature. Try to match those "native" patterns first.
9432 MVT VT = BV->getSimpleValueType(0);
9433 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
9434 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
9435 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
9436 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
9437 unsigned HOpcode;
9438 SDValue V0, V1;
9439 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
9440 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
9441 }
9442
9443 // Try harder to match 256-bit ops by using extract/concat.
9444 if (!Subtarget.hasAVX() || !VT.is256BitVector())
9445 return SDValue();
9446
9447 // Count the number of UNDEF operands in the input build_vector.
9448 unsigned NumElts = VT.getVectorNumElements();
9449 unsigned Half = NumElts / 2;
9450 unsigned NumUndefsLO = 0;
9451 unsigned NumUndefsHI = 0;
9452 for (unsigned i = 0, e = Half; i != e; ++i)
9453 if (BV->getOperand(i)->isUndef())
9454 NumUndefsLO++;
9455
9456 for (unsigned i = Half, e = NumElts; i != e; ++i)
9457 if (BV->getOperand(i)->isUndef())
9458 NumUndefsHI++;
9459
9460 SDLoc DL(BV);
9461 SDValue InVec0, InVec1;
9462 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
9463 SDValue InVec2, InVec3;
9464 unsigned X86Opcode;
9465 bool CanFold = true;
9466
9467 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
9468 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
9469 InVec3) &&
9470 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9471 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9472 X86Opcode = X86ISD::HADD;
9473 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
9474 InVec1) &&
9475 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
9476 InVec3) &&
9477 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
9478 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
9479 X86Opcode = X86ISD::HSUB;
9480 else
9481 CanFold = false;
9482
9483 if (CanFold) {
9484 // Do not try to expand this build_vector into a pair of horizontal
9485 // add/sub if we can emit a pair of scalar add/sub.
9486 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9487 return SDValue();
9488
9489 // Convert this build_vector into a pair of horizontal binops followed by
9490 // a concat vector. We must adjust the outputs from the partial horizontal
9491 // matching calls above to account for undefined vector halves.
9492 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
9493 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
9494 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
9495 bool isUndefLO = NumUndefsLO == Half;
9496 bool isUndefHI = NumUndefsHI == Half;
9497 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
9498 isUndefHI);
9499 }
9500 }
9501
9502 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
9503 VT == MVT::v16i16) {
9504 unsigned X86Opcode;
9505 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
9506 X86Opcode = X86ISD::HADD;
9507 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
9508 InVec1))
9509 X86Opcode = X86ISD::HSUB;
9510 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
9511 InVec1))
9512 X86Opcode = X86ISD::FHADD;
9513 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
9514 InVec1))
9515 X86Opcode = X86ISD::FHSUB;
9516 else
9517 return SDValue();
9518
9519 // Don't try to expand this build_vector into a pair of horizontal add/sub
9520 // if we can simply emit a pair of scalar add/sub.
9521 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
9522 return SDValue();
9523
9524 // Convert this build_vector into two horizontal add/sub followed by
9525 // a concat vector.
9526 bool isUndefLO = NumUndefsLO == Half;
9527 bool isUndefHI = NumUndefsHI == Half;
9528 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
9529 isUndefLO, isUndefHI);
9530 }
9531
9532 return SDValue();
9533}
9534
9535/// If a BUILD_VECTOR's source elements all apply the same bit operation and
9536/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
9537/// just apply the bit operation to the vectors.
9538/// NOTE: It's not in our interest to start making a general purpose vectorizer
9539/// from this, but enough scalar bit operations are created from the later
9540/// legalization + scalarization stages to need basic support.
9541static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
9542 SelectionDAG &DAG) {
9543 SDLoc DL(Op);
9544 MVT VT = Op->getSimpleValueType(0);
9545 unsigned NumElems = VT.getVectorNumElements();
9546 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9547
9548 // Check that all elements have the same opcode.
9549 // TODO: Should we allow UNDEFS and if so how many?
9550 unsigned Opcode = Op->getOperand(0).getOpcode();
9551 for (unsigned i = 1; i < NumElems; ++i)
9552 if (Opcode != Op->getOperand(i).getOpcode())
9553 return SDValue();
9554
9555 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
9556 bool IsShift = false;
9557 switch (Opcode) {
9558 default:
9559 return SDValue();
9560 case ISD::SHL:
9561 case ISD::SRL:
9562 case ISD::SRA:
9563 IsShift = true;
9564 break;
9565 case ISD::AND:
9566 case ISD::XOR:
9567 case ISD::OR:
9568 // Don't do this if the buildvector is a splat - we'd replace one
9569 // constant with an entire vector.
9570 if (Op->getSplatValue())
9571 return SDValue();
9572 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
9573 return SDValue();
9574 break;
9575 }
9576
9577 SmallVector<SDValue, 4> LHSElts, RHSElts;
9578 for (SDValue Elt : Op->ops()) {
9579 SDValue LHS = Elt.getOperand(0);
9580 SDValue RHS = Elt.getOperand(1);
9581
9582 // We expect the canonicalized RHS operand to be the constant.
9583 if (!isa<ConstantSDNode>(RHS))
9584 return SDValue();
9585
9586 // Extend shift amounts.
9587 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
9588 if (!IsShift)
9589 return SDValue();
9590 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
9591 }
9592
9593 LHSElts.push_back(LHS);
9594 RHSElts.push_back(RHS);
9595 }
9596
9597 // Limit to shifts by uniform immediates.
9598 // TODO: Only accept vXi8/vXi64 special cases?
9599 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
9600 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
9601 return SDValue();
9602
9603 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
9604 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
9605 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
9606}
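// Example (not part of the original source) of the rewrite performed above:
//   (v4i32 build_vector (and a, 15), (and b, 15), (and c, 15), (and d, 15))
// becomes
//   (and (v4i32 build_vector a, b, c, d),
//        (v4i32 build_vector 15, 15, 15, 15))
// Splat build_vectors are rejected for AND/XOR/OR, and shifts are only
// accepted with a uniform immediate amount.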
9607
9608/// Create a vector constant without a load. SSE/AVX provide the bare minimum
9609/// functionality to do this, so it's all zeros, all ones, or some derivation
9610/// that is cheap to calculate.
9611static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
9612 const X86Subtarget &Subtarget) {
9613 SDLoc DL(Op);
9614 MVT VT = Op.getSimpleValueType();
9615
9616 // Vectors containing all zeros can be matched by pxor and xorps.
9617 if (ISD::isBuildVectorAllZeros(Op.getNode()))
9618 return Op;
9619
9620 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
9621 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
9622 // vpcmpeqd on 256-bit vectors.
9623 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
9624 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
9625 return Op;
9626
9627 return getOnesVector(VT, DAG, DL);
9628 }
9629
9630 return SDValue();
9631}
9632
9633/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
9634/// from a vector of source values and a vector of extraction indices.
9635/// The vectors might be manipulated to match the type of the permute op.
9636static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
9637 SDLoc &DL, SelectionDAG &DAG,
9638 const X86Subtarget &Subtarget) {
9639 MVT ShuffleVT = VT;
9640 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9641 unsigned NumElts = VT.getVectorNumElements();
9642 unsigned SizeInBits = VT.getSizeInBits();
9643
9644 // Adjust IndicesVec to match VT size.
9645 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
9646 "Illegal variable permute mask size");
9647 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
9648 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
9649 NumElts * VT.getScalarSizeInBits());
9650 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
9651
9652 // Handle a SrcVec whose size doesn't match VT.
9653 if (SrcVec.getValueSizeInBits() != SizeInBits) {
9654 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
9655 // Handle larger SrcVec by treating it as a larger permute.
9656 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
9657 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
9658 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
9659 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
9660 Subtarget, DAG, SDLoc(IndicesVec));
9661 SDValue NewSrcVec =
9662 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9663 if (NewSrcVec)
9664 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
9665 return SDValue();
9666 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
9667 // Widen smaller SrcVec to match VT.
9668 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
9669 } else
9670 return SDValue();
9671 }
9672
9673 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
9674 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
9675 EVT SrcVT = Idx.getValueType();
9676 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
9677 uint64_t IndexScale = 0;
9678 uint64_t IndexOffset = 0;
9679
9680 // If we're scaling a smaller permute op, then we need to repeat the
9681 // indices, scaling and offsetting them as well.
9682 // e.g. v4i32 -> v16i8 (Scale = 4)
9683 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
9684 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
9685 for (uint64_t i = 0; i != Scale; ++i) {
9686 IndexScale |= Scale << (i * NumDstBits);
9687 IndexOffset |= i << (i * NumDstBits);
9688 }
9689
9690 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
9691 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
9692 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
9693 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
9694 return Idx;
9695 };
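// Worked example (not part of the original source) for ScaleIndices when
// scaling v4i32 indices to v16i8: Scale = 4, NumDstBits = 8, so
//   IndexScale  = 0x04040404 and IndexOffset = 0x03020100.
// A lane holding index 2 becomes 2 * 0x04040404 + 0x03020100 = 0x0B0A0908,
// i.e. byte indices 8, 9, 10, 11 once reinterpreted as v16i8 -- exactly the
// four bytes of 32-bit element 2.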
9696
9697 unsigned Opcode = 0;
9698 switch (VT.SimpleTy) {
9699 default:
9700 break;
9701 case MVT::v16i8:
9702 if (Subtarget.hasSSSE3())
9703 Opcode = X86ISD::PSHUFB;
9704 break;
9705 case MVT::v8i16:
9706 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9707 Opcode = X86ISD::VPERMV;
9708 else if (Subtarget.hasSSSE3()) {
9709 Opcode = X86ISD::PSHUFB;
9710 ShuffleVT = MVT::v16i8;
9711 }
9712 break;
9713 case MVT::v4f32:
9714 case MVT::v4i32:
9715 if (Subtarget.hasAVX()) {
9716 Opcode = X86ISD::VPERMILPV;
9717 ShuffleVT = MVT::v4f32;
9718 } else if (Subtarget.hasSSSE3()) {
9719 Opcode = X86ISD::PSHUFB;
9720 ShuffleVT = MVT::v16i8;
9721 }
9722 break;
9723 case MVT::v2f64:
9724 case MVT::v2i64:
9725 if (Subtarget.hasAVX()) {
9726 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
9727 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9728 Opcode = X86ISD::VPERMILPV;
9729 ShuffleVT = MVT::v2f64;
9730 } else if (Subtarget.hasSSE41()) {
9731 // SSE41 can compare v2i64 - select between indices 0 and 1.
9732 return DAG.getSelectCC(
9733 DL, IndicesVec,
9734 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
9735 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
9736 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
9737 ISD::CondCode::SETEQ);
9738 }
9739 break;
9740 case MVT::v32i8:
9741 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9742 Opcode = X86ISD::VPERMV;
9743 else if (Subtarget.hasXOP()) {
9744 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9745 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9746 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9747 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9748 return DAG.getNode(
9749 ISD::CONCAT_VECTORS, DL, VT,
9750 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9751 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9752 } else if (Subtarget.hasAVX()) {
9753 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9754 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9755 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9756 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9757 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9758 ArrayRef<SDValue> Ops) {
9759 // Permute Lo and Hi and then select based on index range.
9760 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9761 // care about bit[7] as it's just an index vector.
9762 SDValue Idx = Ops[2];
9763 EVT VT = Idx.getValueType();
9764 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9765 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9766 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9767 ISD::CondCode::SETGT);
9768 };
9769 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9770 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9771 PSHUFBBuilder);
9772 }
9773 break;
9774 case MVT::v16i16:
9775 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9776 Opcode = X86ISD::VPERMV;
9777 else if (Subtarget.hasAVX()) {
9778 // Scale to v32i8 and perform as v32i8.
9779 IndicesVec = ScaleIndices(IndicesVec, 2);
9780 return DAG.getBitcast(
9781 VT, createVariablePermute(
9782 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9783 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9784 }
9785 break;
9786 case MVT::v8f32:
9787 case MVT::v8i32:
9788 if (Subtarget.hasAVX2())
9789 Opcode = X86ISD::VPERMV;
9790 else if (Subtarget.hasAVX()) {
9791 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9792 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9793 {0, 1, 2, 3, 0, 1, 2, 3});
9794 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9795 {4, 5, 6, 7, 4, 5, 6, 7});
9796 if (Subtarget.hasXOP())
9797 return DAG.getBitcast(
9798 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9799 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9800 // Permute Lo and Hi and then select based on index range.
9801 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9802 SDValue Res = DAG.getSelectCC(
9803 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9804 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9805 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9806 ISD::CondCode::SETGT);
9807 return DAG.getBitcast(VT, Res);
9808 }
9809 break;
9810 case MVT::v4i64:
9811 case MVT::v4f64:
9812 if (Subtarget.hasAVX512()) {
9813 if (!Subtarget.hasVLX()) {
9814 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9815 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9816 SDLoc(SrcVec));
9817 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9818 DAG, SDLoc(IndicesVec));
9819 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9820 DAG, Subtarget);
9821 return extract256BitVector(Res, 0, DAG, DL);
9822 }
9823 Opcode = X86ISD::VPERMV;
9824 } else if (Subtarget.hasAVX()) {
9825 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9826 SDValue LoLo =
9827 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9828 SDValue HiHi =
9829 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9830 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9831 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9832 if (Subtarget.hasXOP())
9833 return DAG.getBitcast(
9834 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9835 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9836 // Permute Lo and Hi and then select based on index range.
9837 // This works as VPERMILPD only uses index bit[1] to permute elements.
9838 SDValue Res = DAG.getSelectCC(
9839 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9840 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9841 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9842 ISD::CondCode::SETGT);
9843 return DAG.getBitcast(VT, Res);
9844 }
9845 break;
9846 case MVT::v64i8:
9847 if (Subtarget.hasVBMI())
9848 Opcode = X86ISD::VPERMV;
9849 break;
9850 case MVT::v32i16:
9851 if (Subtarget.hasBWI())
9852 Opcode = X86ISD::VPERMV;
9853 break;
9854 case MVT::v16f32:
9855 case MVT::v16i32:
9856 case MVT::v8f64:
9857 case MVT::v8i64:
9858 if (Subtarget.hasAVX512())
9859 Opcode = X86ISD::VPERMV;
9860 break;
9861 }
9862 if (!Opcode)
9863 return SDValue();
9864
9865 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9866 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9867 "Illegal variable permute shuffle type");
9868
9869 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9870 if (Scale > 1)
9871 IndicesVec = ScaleIndices(IndicesVec, Scale);
9872
9873 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9874 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9875
9876 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9877 SDValue Res = Opcode == X86ISD::VPERMV
9878 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9879 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9880 return DAG.getBitcast(VT, Res);
9881}
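// --- Illustrative sketch (not part of the original source) ---
// Scalar reference for the variable permute every case above lowers to: each
// result element selects a source element by a runtime index. Out-of-range
// behaviour differs per instruction (e.g. PSHUFB zeroes an element when bit 7
// of its index is set); RefVariablePermute is a hypothetical helper that
// simply masks to the lane count, as VPERMD does for v8i32.
static void RefVariablePermute(const int Src[8], const int Indices[8],
                               int Out[8]) {
  for (int i = 0; i != 8; ++i)
    Out[i] = Src[Indices[i] & 7];
}
// --- end sketch ---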
9882
9883// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9884// reasoned to be a permutation of a vector by indices in a non-constant vector.
9885// (build_vector (extract_elt V, (extract_elt I, 0)),
9886// (extract_elt V, (extract_elt I, 1)),
9887// ...
9888// ->
9889// (vpermv I, V)
9890//
9891// TODO: Handle undefs
9892// TODO: Utilize pshufb and zero mask blending to support more efficient
9893// construction of vectors with constant-0 elements.
9894static SDValue
9895LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
9896 const X86Subtarget &Subtarget) {
9897 SDValue SrcVec, IndicesVec;
9898 // Check for a match of the permute source vector and permute index elements.
9899 // This is done by checking that the i-th build_vector operand is of the form:
9900 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9901 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9902 SDValue Op = V.getOperand(Idx);
9903 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9904 return SDValue();
9905
9906 // If this is the first extract encountered in V, set the source vector,
9907 // otherwise verify the extract is from the previously defined source
9908 // vector.
9909 if (!SrcVec)
9910 SrcVec = Op.getOperand(0);
9911 else if (SrcVec != Op.getOperand(0))
9912 return SDValue();
9913 SDValue ExtractedIndex = Op->getOperand(1);
9914 // Peek through extends.
9915 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9916 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9917 ExtractedIndex = ExtractedIndex.getOperand(0);
9918 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9919 return SDValue();
9920
9921 // If this is the first extract from the index vector candidate, set the
9922 // indices vector, otherwise verify the extract is from the previously
9923 // defined indices vector.
9924 if (!IndicesVec)
9925 IndicesVec = ExtractedIndex.getOperand(0);
9926 else if (IndicesVec != ExtractedIndex.getOperand(0))
9927 return SDValue();
9928
9929 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9930 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9931 return SDValue();
9932 }
9933
9934 SDLoc DL(V);
9935 MVT VT = V.getSimpleValueType();
9936 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9937}
9938
9939SDValue
9940X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9941 SDLoc dl(Op);
9942
9943 MVT VT = Op.getSimpleValueType();
9944 MVT EltVT = VT.getVectorElementType();
9945 unsigned NumElems = Op.getNumOperands();
9946
9947 // Generate vectors for predicate vectors.
9948 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9949 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
9950
9951 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
9952 return VectorConstant;
9953
9954 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9955 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
9956 return AddSub;
9957 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
9958 return HorizontalOp;
9959 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
9960 return Broadcast;
9961 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
9962 return BitOp;
9963
9964 unsigned EVTBits = EltVT.getSizeInBits();
9965
9966 unsigned NumZero = 0;
9967 unsigned NumNonZero = 0;
9968 uint64_t NonZeros = 0;
9969 bool IsAllConstants = true;
9970 SmallSet<SDValue, 8> Values;
9971 unsigned NumConstants = NumElems;
9972 for (unsigned i = 0; i < NumElems; ++i) {
9973 SDValue Elt = Op.getOperand(i);
9974 if (Elt.isUndef())
9975 continue;
9976 Values.insert(Elt);
9977 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
9978 IsAllConstants = false;
9979 NumConstants--;
9980 }
9981 if (X86::isZeroNode(Elt))
9982 NumZero++;
9983 else {
9984 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
9985 NonZeros |= ((uint64_t)1 << i);
9986 NumNonZero++;
9987 }
9988 }
9989
9990 // All undef vector. Return an UNDEF. All zero vectors were handled above.
9991 if (NumNonZero == 0)
9992 return DAG.getUNDEF(VT);
9993
9994 // If we are inserting one variable into a vector of non-zero constants, try
9995 // to avoid loading each constant element as a scalar. Load the constants as a
9996 // vector and then insert the variable scalar element. If insertion is not
9997 // supported, fall back to a shuffle to get the scalar blended with the
9998 // constants. Insertion into a zero vector is handled as a special-case
9999 // somewhere below here.
10000 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
10001 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
10002 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
10003 // Create an all-constant vector. The variable element in the old
10004 // build vector is replaced by undef in the constant vector. Save the
10005 // variable scalar element and its index for use in the insertelement.
10006 LLVMContext &Context = *DAG.getContext();
10007 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
10008 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
10009 SDValue VarElt;
10010 SDValue InsIndex;
10011 for (unsigned i = 0; i != NumElems; ++i) {
10012 SDValue Elt = Op.getOperand(i);
10013 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
10014 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
10015 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
10016 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
10017 else if (!Elt.isUndef()) {
10018 assert(!VarElt.getNode() && !InsIndex.getNode() &&
10019 "Expected one variable element in this vector");
10020 VarElt = Elt;
10021 InsIndex = DAG.getVectorIdxConstant(i, dl);
10022 }
10023 }
10024 Constant *CV = ConstantVector::get(ConstVecOps);
10025 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
10026
10027 // The constants we just created may not be legal (eg, floating point). We
10028 // must lower the vector right here because we can not guarantee that we'll
10029 // legalize it before loading it. This is also why we could not just create
10030 // a new build vector here. If the build vector contains illegal constants,
10031 // it could get split back up into a series of insert elements.
10032 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
10033 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
10034 MachineFunction &MF = DAG.getMachineFunction();
10035 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
10036 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
10037 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
10038 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
10039 if (InsertC < NumEltsInLow128Bits)
10040 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
10041
10042 // There's no good way to insert into the high elements of a >128-bit
10043 // vector, so use shuffles to avoid an extract/insert sequence.
10044 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
10045 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
10046 SmallVector<int, 8> ShuffleMask;
10047 unsigned NumElts = VT.getVectorNumElements();
10048 for (unsigned i = 0; i != NumElts; ++i)
10049 ShuffleMask.push_back(i == InsertC ? NumElts : i);
10050 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
10051 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
10052 }
10053
10054 // Special case for single non-zero, non-undef, element.
10055 if (NumNonZero == 1) {
10056 unsigned Idx = countTrailingZeros(NonZeros);
10057 SDValue Item = Op.getOperand(Idx);
10058
10059 // If we have a constant or non-constant insertion into the low element of
10060 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
10061 // the rest of the elements. This will be matched as movd/movq/movss/movsd
10062 // depending on what the source datatype is.
10063 if (Idx == 0) {
10064 if (NumZero == 0)
10065 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10066
10067 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
10068 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
10069 assert((VT.is128BitVector() || VT.is256BitVector() ||
10070 VT.is512BitVector()) &&
10071 "Expected an SSE value type!");
10072 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10073 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
10074 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10075 }
10076
10077 // We can't directly insert an i8 or i16 into a vector, so zero extend
10078 // it to i32 first.
10079 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
10080 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
10081 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
10082 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
10083 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
10084 return DAG.getBitcast(VT, Item);
10085 }
10086 }
10087
10088 // Is it a vector logical left shift?
10089 if (NumElems == 2 && Idx == 1 &&
10090 X86::isZeroNode(Op.getOperand(0)) &&
10091 !X86::isZeroNode(Op.getOperand(1))) {
10092 unsigned NumBits = VT.getSizeInBits();
10093 return getVShift(true, VT,
10094 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
10095 VT, Op.getOperand(1)),
10096 NumBits/2, DAG, *this, dl);
10097 }
10098
10099 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
10100 return SDValue();
10101
10102 // Otherwise, if this is a vector with i32 or f32 elements, and the element
10103 // is a non-constant being inserted into an element other than the low one,
10104 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
10105 // movd/movss) to move this into the low element, then shuffle it into
10106 // place.
10107 if (EVTBits == 32) {
10108 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
10109 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
10110 }
10111 }
10112
10113 // Splat is obviously ok. Let legalizer expand it to a shuffle.
10114 if (Values.size() == 1) {
10115 if (EVTBits == 32) {
10116 // Instead of a shuffle like this:
10117 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
10118 // Check if it's possible to issue this instead.
10119 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
10120 unsigned Idx = countTrailingZeros(NonZeros);
10121 SDValue Item = Op.getOperand(Idx);
10122 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
10123 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
10124 }
10125 return SDValue();
10126 }
10127
10128 // A vector full of immediates; various special cases are already
10129 // handled, so this is best done with a single constant-pool load.
10130 if (IsAllConstants)
10131 return SDValue();
10132
10133 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
10134 return V;
10135
10136 // See if we can use a vector load to get all of the elements.
10137 {
10138 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
10139 if (SDValue LD =
10140 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
10141 return LD;
10142 }
10143
10144 // If this is a splat of pairs of 32-bit elements, we can use a narrower
10145 // build_vector and broadcast it.
10146 // TODO: We could probably generalize this more.
10147 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
10148 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
10149 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
10150 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
10151 // Make sure all the even/odd operands match.
10152 for (unsigned i = 2; i != NumElems; ++i)
10153 if (Ops[i % 2] != Op.getOperand(i))
10154 return false;
10155 return true;
10156 };
10157 if (CanSplat(Op, NumElems, Ops)) {
10158 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
10159 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
10160 // Create a new build vector and cast to v2i64/v2f64.
10161 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
10162 DAG.getBuildVector(NarrowVT, dl, Ops));
10163 // Broadcast from v2i64/v2f64 and cast to final VT.
10164 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
10165 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
10166 NewBV));
10167 }
10168 }
10169
10170 // For AVX-length vectors, build the individual 128-bit pieces and use
10171 // shuffles to put them in place.
10172 if (VT.getSizeInBits() > 128) {
10173 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
10174
10175 // Build both the lower and upper subvector.
10176 SDValue Lower =
10177 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
10178 SDValue Upper = DAG.getBuildVector(
10179 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
10180
10181 // Recreate the wider vector with the lower and upper part.
10182 return concatSubVectors(Lower, Upper, DAG, dl);
10183 }
10184
10185 // Let legalizer expand 2-wide build_vectors.
10186 if (EVTBits == 64) {
10187 if (NumNonZero == 1) {
10188 // One half is zero or undef.
10189 unsigned Idx = countTrailingZeros(NonZeros);
10190 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
10191 Op.getOperand(Idx));
10192 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
10193 }
10194 return SDValue();
10195 }
10196
10197 // If element VT is < 32 bits, convert it to inserts into a zero vector.
10198 if (EVTBits == 8 && NumElems == 16)
10199 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
10200 DAG, Subtarget))
10201 return V;
10202
10203 if (EVTBits == 16 && NumElems == 8)
10204 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
10205 DAG, Subtarget))
10206 return V;
10207
10208 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
10209 if (EVTBits == 32 && NumElems == 4)
10210 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
10211 return V;
10212
10213 // If element VT is == 32 bits, turn it into a number of shuffles.
10214 if (NumElems == 4 && NumZero > 0) {
10215 SmallVector<SDValue, 8> Ops(NumElems);
10216 for (unsigned i = 0; i < 4; ++i) {
10217 bool isZero = !(NonZeros & (1ULL << i));
10218 if (isZero)
10219 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
10220 else
10221 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10222 }
10223
10224 for (unsigned i = 0; i < 2; ++i) {
10225 switch ((NonZeros >> (i*2)) & 0x3) {
10226 default: llvm_unreachable("Unexpected NonZero count");
10227 case 0:
10228 Ops[i] = Ops[i*2]; // Must be a zero vector.
10229 break;
10230 case 1:
10231 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
10232 break;
10233 case 2:
10234 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10235 break;
10236 case 3:
10237 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
10238 break;
10239 }
10240 }
10241
10242 bool Reverse1 = (NonZeros & 0x3) == 2;
10243 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
10244 int MaskVec[] = {
10245 Reverse1 ? 1 : 0,
10246 Reverse1 ? 0 : 1,
10247 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
10248 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
10249 };
10250 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
10251 }
10252
10253 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
10254
10255 // Check for a build_vector that is mostly a shuffle plus a few insertions.
10256 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
10257 return Sh;
10258
10259 // For SSE 4.1, use insertps to put the high elements into the low element.
10260 if (Subtarget.hasSSE41()) {
10261 SDValue Result;
10262 if (!Op.getOperand(0).isUndef())
10263 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
10264 else
10265 Result = DAG.getUNDEF(VT);
10266
10267 for (unsigned i = 1; i < NumElems; ++i) {
10268 if (Op.getOperand(i).isUndef()) continue;
10269 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
10270 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
10271 }
10272 return Result;
10273 }
10274
10275 // Otherwise, expand into a number of unpckl*, start by extending each of
10276 // our (non-undef) elements to the full vector width with the element in the
10277 // bottom slot of the vector (which generates no code for SSE).
10278 SmallVector<SDValue, 8> Ops(NumElems);
10279 for (unsigned i = 0; i < NumElems; ++i) {
10280 if (!Op.getOperand(i).isUndef())
10281 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
10282 else
10283 Ops[i] = DAG.getUNDEF(VT);
10284 }
10285
10286 // Next, we iteratively mix elements, e.g. for v4f32:
10287 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
10288 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
10289 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
10290 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
10291 // Generate scaled UNPCKL shuffle mask.
10292 SmallVector<int, 16> Mask;
10293 for(unsigned i = 0; i != Scale; ++i)
10294 Mask.push_back(i);
10295 for (unsigned i = 0; i != Scale; ++i)
10296 Mask.push_back(NumElems+i);
10297 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
10298
10299 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
10300 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
10301 }
10302 return Ops[0];
10303}
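// --- Illustrative sketch (not part of the original source) ---
// Scalar model of the final unpckl ladder above for a v4f32 build_vector of
// four variable elements; Unpacklo and RefUnpackLadder are hypothetical
// helpers.
static void Unpacklo(const float A[4], const float B[4], float Out[4]) {
  // unpcklps: interleave the low halves of A and B.
  Out[0] = A[0]; Out[1] = B[0]; Out[2] = A[1]; Out[3] = B[1];
}
static void RefUnpackLadder(const float E[4], float Out[4]) {
  float V[4][4]; // E[i] placed in lane 0 of its own vector (other lanes don't-care)
  for (int i = 0; i != 4; ++i)
    for (int j = 0; j != 4; ++j)
      V[i][j] = E[i];
  float X[4], Y[4];
  Unpacklo(V[0], V[1], X); // Step 1: X = <?, ?, e1, e0>
  Unpacklo(V[2], V[3], Y); //         Y = <?, ?, e3, e2>
  // Step 2: unpcklpd X, Y interleaves their 64-bit halves -> <e3, e2, e1, e0>.
  Out[0] = X[0]; Out[1] = X[1]; Out[2] = Y[0]; Out[3] = Y[1];
}
// --- end sketch ---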
10304
10305// 256-bit AVX can use the vinsertf128 instruction
10306// to create 256-bit vectors from two other 128-bit ones.
10307// TODO: Detect subvector broadcast here instead of DAG combine?
10308static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
10309 const X86Subtarget &Subtarget) {
10310 SDLoc dl(Op);
10311 MVT ResVT = Op.getSimpleValueType();
10312
10313 assert((ResVT.is256BitVector() ||
10314 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
10315
10316 unsigned NumOperands = Op.getNumOperands();
10317 unsigned NumZero = 0;
10318 unsigned NumNonZero = 0;
10319 unsigned NonZeros = 0;
10320 for (unsigned i = 0; i != NumOperands; ++i) {
10321 SDValue SubVec = Op.getOperand(i);
10322 if (SubVec.isUndef())
10323 continue;
10324 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10325 ++NumZero;
10326 else {
10327 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10328 NonZeros |= 1 << i;
10329 ++NumNonZero;
10330 }
10331 }
10332
10333 // If we have more than 2 non-zeros, build each half separately.
10334 if (NumNonZero > 2) {
10335 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10336 ArrayRef<SDUse> Ops = Op->ops();
10337 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10338 Ops.slice(0, NumOperands/2));
10339 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10340 Ops.slice(NumOperands/2));
10341 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10342 }
10343
10344 // Otherwise, build it up through insert_subvectors.
10345 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
10346 : DAG.getUNDEF(ResVT);
10347
10348 MVT SubVT = Op.getOperand(0).getSimpleValueType();
10349 unsigned NumSubElems = SubVT.getVectorNumElements();
10350 for (unsigned i = 0; i != NumOperands; ++i) {
10351 if ((NonZeros & (1 << i)) == 0)
10352 continue;
10353
10354 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
10355 Op.getOperand(i),
10356 DAG.getIntPtrConstant(i * NumSubElems, dl));
10357 }
10358
10359 return Vec;
10360}
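// Example (not part of the original source): for
//   (v16i32 concat_vectors A, zeroinitializer, B, C)
// three of the four v4i32 pieces are non-zero, so the code above builds the
// v8i32 halves (concat_vectors A, zeroinitializer) and (concat_vectors B, C)
// and concatenates them. With at most two non-zero pieces it instead starts
// from a zero (or undef) vector and emits one insert_subvector per piece.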
10361
10362// Returns true if the given node is a type promotion (by concatenating i1
10363// zeros) of the result of a node that already zeros all upper bits of
10364// k-register.
10365// TODO: Merge this with LowerAVXCONCAT_VECTORS?
10366static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
10367 const X86Subtarget &Subtarget,
10368 SelectionDAG & DAG) {
10369 SDLoc dl(Op);
10370 MVT ResVT = Op.getSimpleValueType();
10371 unsigned NumOperands = Op.getNumOperands();
10372
10373   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
10374          "Unexpected number of operands in CONCAT_VECTORS");
10375
10376 uint64_t Zeros = 0;
10377 uint64_t NonZeros = 0;
10378 for (unsigned i = 0; i != NumOperands; ++i) {
10379 SDValue SubVec = Op.getOperand(i);
10380 if (SubVec.isUndef())
10381 continue;
10382     assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
10383 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
10384 Zeros |= (uint64_t)1 << i;
10385 else
10386 NonZeros |= (uint64_t)1 << i;
10387 }
10388
10389 unsigned NumElems = ResVT.getVectorNumElements();
10390
10391   // If we are inserting a non-zero vector and there are zeros in the LSBs and
10392   // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
10393   // insert_subvector will give us two kshifts.
10394 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
10395 Log2_64(NonZeros) != NumOperands - 1) {
10396 MVT ShiftVT = ResVT;
10397 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
10398 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
10399 unsigned Idx = Log2_64(NonZeros);
10400 SDValue SubVec = Op.getOperand(Idx);
10401 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10402 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
10403 DAG.getUNDEF(ShiftVT), SubVec,
10404 DAG.getIntPtrConstant(0, dl));
10405 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
10406 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
10407 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
10408 DAG.getIntPtrConstant(0, dl));
10409 }
10410
10411 // If there are zero or one non-zeros we can handle this very simply.
10412 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
10413 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
10414 if (!NonZeros)
10415 return Vec;
10416 unsigned Idx = Log2_64(NonZeros);
10417 SDValue SubVec = Op.getOperand(Idx);
10418 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
10419 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
10420 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
10421 }
10422
10423 if (NumOperands > 2) {
10424 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
10425 ArrayRef<SDUse> Ops = Op->ops();
10426 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10427 Ops.slice(0, NumOperands/2));
10428 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
10429 Ops.slice(NumOperands/2));
10430 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
10431 }
10432
10433   assert(countPopulation(NonZeros) == 2 && "Simple cases not handled?");
10434
10435 if (ResVT.getVectorNumElements() >= 16)
10436 return Op; // The operation is legal with KUNPCK
10437
10438 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
10439 DAG.getUNDEF(ResVT), Op.getOperand(0),
10440 DAG.getIntPtrConstant(0, dl));
10441 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
10442 DAG.getIntPtrConstant(NumElems/2, dl));
10443}
10444
10445static SDValue LowerCONCAT_VECTORS(SDValue Op,
10446 const X86Subtarget &Subtarget,
10447 SelectionDAG &DAG) {
10448 MVT VT = Op.getSimpleValueType();
10449 if (VT.getVectorElementType() == MVT::i1)
10450 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
10451
10452   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
10453          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
10454                                   Op.getNumOperands() == 4)));
10455
10456 // AVX can use the vinsertf128 instruction to create 256-bit vectors
10457 // from two other 128-bit ones.
10458
10459 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
10460 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
10461}
10462
10463//===----------------------------------------------------------------------===//
10464// Vector shuffle lowering
10465//
10466// This is an experimental code path for lowering vector shuffles on x86. It is
10467// designed to handle arbitrary vector shuffles and blends, gracefully
10468// degrading performance as necessary. It works hard to recognize idiomatic
10469// shuffles and lower them to optimal instruction patterns without leaving
10470// a framework that allows reasonably efficient handling of all vector shuffle
10471// patterns.
10472//===----------------------------------------------------------------------===//
10473
10474/// Tiny helper function to identify a no-op mask.
10475///
10476/// This is a somewhat boring predicate function. It checks whether the mask
10477/// array input, which is assumed to be a single-input shuffle mask of the kind
10478/// used by the X86 shuffle instructions (not a fully general
10479/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
10480/// in-place shuffle are 'no-op's.
10481static bool isNoopShuffleMask(ArrayRef<int> Mask) {
10482 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10483     assert(Mask[i] >= -1 && "Out of bound mask element!");
10484 if (Mask[i] >= 0 && Mask[i] != i)
10485 return false;
10486 }
10487 return true;
10488}
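
An illustrative, self-contained sketch of the same predicate (not part of X86ISelLowering.cpp), using plain std:: containers with -1 standing in for the undef sentinel:

#include <cassert>
#include <vector>

// Returns true if every defined element already sits at its own index,
// i.e. the mask describes an identity (no-op) shuffle.
static bool isNoopMask(const std::vector<int> &Mask) {
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "out of bound mask element");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}

int main() {
  assert(isNoopMask({0, -1, 2, 3}));  // undef entries are ignored
  assert(!isNoopMask({1, 0, 2, 3}));  // element 0 moves, so not a no-op
}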
10489
10490/// Test whether there are elements crossing LaneSizeInBits lanes in this
10491/// shuffle mask.
10492///
10493/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
10494/// and we routinely test for these.
10495static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
10496 unsigned ScalarSizeInBits,
10497 ArrayRef<int> Mask) {
10498   assert(LaneSizeInBits && ScalarSizeInBits &&
10499          (LaneSizeInBits % ScalarSizeInBits) == 0 &&
10500          "Illegal shuffle lane size");
10501 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
10502 int Size = Mask.size();
10503 for (int i = 0; i < Size; ++i)
10504 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10505 return true;
10506 return false;
10507}
10508
10509/// Test whether there are elements crossing 128-bit lanes in this
10510/// shuffle mask.
10511static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
10512 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
10513}
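
A minimal standalone sketch of the lane-crossing test above (illustrative only; function name and std:: types are this sketch's own, not the file's):

#include <cassert>
#include <vector>

// Does any defined element pull data from a different LaneSizeInBits-wide lane?
// Second-operand indices ([Size, 2*Size)) are folded back with "% Size" first.
static bool crossesLanes(unsigned LaneSizeInBits, unsigned ScalarSizeInBits,
                         const std::vector<int> &Mask) {
  int LaneSize = LaneSizeInBits / ScalarSizeInBits;
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // v8f32 with 128-bit lanes: four elements per lane.
  assert(crossesLanes(128, 32, {4, 5, 6, 7, 0, 1, 2, 3}));  // swaps the two halves
  assert(!crossesLanes(128, 32, {1, 0, 3, 2, 5, 4, 7, 6})); // per-lane swap only
}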
10514
10515/// Test whether a shuffle mask is equivalent within each sub-lane.
10516///
10517/// This checks a shuffle mask to see if it is performing the same
10518/// lane-relative shuffle in each sub-lane. This trivially implies
10519/// that it is also not lane-crossing. It may however involve a blend from the
10520/// same lane of a second vector.
10521///
10522/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
10523/// non-trivial to compute in the face of undef lanes. The representation is
10524/// suitable for use with existing 128-bit shuffles as entries from the second
10525/// vector have been remapped to [LaneSize, 2*LaneSize).
10526static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10527 ArrayRef<int> Mask,
10528 SmallVectorImpl<int> &RepeatedMask) {
10529 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10530 RepeatedMask.assign(LaneSize, -1);
10531 int Size = Mask.size();
10532 for (int i = 0; i < Size; ++i) {
10533     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10534 if (Mask[i] < 0)
10535 continue;
10536 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10537 // This entry crosses lanes, so there is no way to model this shuffle.
10538 return false;
10539
10540 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10541 // Adjust second vector indices to start at LaneSize instead of Size.
10542 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10543 : Mask[i] % LaneSize + LaneSize;
10544 if (RepeatedMask[i % LaneSize] < 0)
10545 // This is the first non-undef entry in this slot of a 128-bit lane.
10546 RepeatedMask[i % LaneSize] = LocalM;
10547 else if (RepeatedMask[i % LaneSize] != LocalM)
10548 // Found a mismatch with the repeated mask.
10549 return false;
10550 }
10551 return true;
10552}
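
The repeated-mask idea can be illustrated with a standalone sketch (not from this file; it mirrors the loop above with plain std:: types and hard-codes 128-bit lanes):

#include <cassert>
#include <vector>

// If the same lane-relative pattern is used in every 128-bit lane, return it
// in RepeatedMask (second-operand entries remapped to [LaneSize, 2*LaneSize)).
static bool isRepeated128BitMask(unsigned ScalarSizeInBits,
                                 const std::vector<int> &Mask,
                                 std::vector<int> &RepeatedMask) {
  int LaneSize = 128 / ScalarSizeInBits;
  int Size = (int)Mask.size();
  RepeatedMask.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false; // crosses lanes, cannot be modelled this way
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      return false; // mismatch between lanes
  }
  return true;
}

int main() {
  std::vector<int> Repeated;
  // v8f32 mask that swaps adjacent pairs in both 128-bit lanes.
  assert(isRepeated128BitMask(32, {1, 0, 3, 2, 5, 4, 7, 6}, Repeated));
  assert((Repeated == std::vector<int>{1, 0, 3, 2}));
}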
10553
10554/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10555static bool
10556is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10557 SmallVectorImpl<int> &RepeatedMask) {
10558 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10559}
10560
10561static bool
10562is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10563 SmallVector<int, 32> RepeatedMask;
10564 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10565}
10566
10567/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10568static bool
10569is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10570 SmallVectorImpl<int> &RepeatedMask) {
10571 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10572}
10573
10574/// Test whether a target shuffle mask is equivalent within each sub-lane.
10575/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10576static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10577 ArrayRef<int> Mask,
10578 SmallVectorImpl<int> &RepeatedMask) {
10579 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10580 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10581 int Size = Mask.size();
10582 for (int i = 0; i < Size; ++i) {
10583     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10584 if (Mask[i] == SM_SentinelUndef)
10585 continue;
10586 if (Mask[i] == SM_SentinelZero) {
10587 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10588 return false;
10589 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10590 continue;
10591 }
10592 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10593 // This entry crosses lanes, so there is no way to model this shuffle.
10594 return false;
10595
10596 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10597 // Adjust second vector indices to start at LaneSize instead of Size.
10598 int LocalM =
10599 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
10600 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10601 // This is the first non-undef entry in this slot of a 128-bit lane.
10602 RepeatedMask[i % LaneSize] = LocalM;
10603 else if (RepeatedMask[i % LaneSize] != LocalM)
10604 // Found a mismatch with the repeated mask.
10605 return false;
10606 }
10607 return true;
10608}
10609
10610/// Checks whether a shuffle mask is equivalent to an explicit list of
10611/// arguments.
10612///
10613/// This is a fast way to test a shuffle mask against a fixed pattern:
10614///
10615 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
10616///
10617/// It returns true if the mask is exactly as wide as the argument list, and
10618/// each element of the mask is either -1 (signifying undef) or the value given
10619/// in the argument.
10620static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
10621 ArrayRef<int> ExpectedMask) {
10622 if (Mask.size() != ExpectedMask.size())
10623 return false;
10624
10625 int Size = Mask.size();
10626
10627 // If the values are build vectors, we can look through them to find
10628 // equivalent inputs that make the shuffles equivalent.
10629 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
10630 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
10631
10632 for (int i = 0; i < Size; ++i) {
10633     assert(Mask[i] >= -1 && "Out of bound mask element!");
10634 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
10635 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
10636 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
10637 if (!MaskBV || !ExpectedBV ||
10638 MaskBV->getOperand(Mask[i] % Size) !=
10639 ExpectedBV->getOperand(ExpectedMask[i] % Size))
10640 return false;
10641 }
10642 }
10643
10644 return true;
10645}
10646
10647/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10648///
10649/// The masks must be exactly the same width.
10650///
10651/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10652/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10653///
10654/// SM_SentinelZero is accepted as a valid negative index but must match in
10655/// both.
10656static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
10657 ArrayRef<int> ExpectedMask,
10658 SDValue V1 = SDValue(),
10659 SDValue V2 = SDValue()) {
10660 int Size = Mask.size();
10661 if (Size != (int)ExpectedMask.size())
10662 return false;
10663   assert(isUndefOrZeroOrInRange(ExpectedMask, 0, 2 * Size) &&
10664          "Illegal target shuffle mask");
10665
10666 // Check for out-of-range target shuffle mask indices.
10667 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10668 return false;
10669
10670 // If the values are build vectors, we can look through them to find
10671 // equivalent inputs that make the shuffles equivalent.
10672 auto *BV1 = dyn_cast_or_null<BuildVectorSDNode>(V1);
10673 auto *BV2 = dyn_cast_or_null<BuildVectorSDNode>(V2);
10674 BV1 = ((BV1 && Size != (int)BV1->getNumOperands()) ? nullptr : BV1);
10675 BV2 = ((BV2 && Size != (int)BV2->getNumOperands()) ? nullptr : BV2);
10676
10677 for (int i = 0; i < Size; ++i) {
10678 if (Mask[i] == SM_SentinelUndef || Mask[i] == ExpectedMask[i])
10679 continue;
10680 if (0 <= Mask[i] && 0 <= ExpectedMask[i]) {
10681 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
10682 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
10683 if (MaskBV && ExpectedBV &&
10684 MaskBV->getOperand(Mask[i] % Size) ==
10685 ExpectedBV->getOperand(ExpectedMask[i] % Size))
10686 continue;
10687 }
10688 // TODO - handle SM_Sentinel equivalences.
10689 return false;
10690 }
10691 return true;
10692}
10693
10694// Attempt to create a shuffle mask from a VSELECT condition mask.
10695static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
10696 SDValue Cond) {
10697 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
10698 return false;
10699
10700 unsigned Size = Cond.getValueType().getVectorNumElements();
10701 Mask.resize(Size, SM_SentinelUndef);
10702
10703 for (int i = 0; i != (int)Size; ++i) {
10704 SDValue CondElt = Cond.getOperand(i);
10705 Mask[i] = i;
10706 // Arbitrarily choose from the 2nd operand if the select condition element
10707 // is undef.
10708 // TODO: Can we do better by matching patterns such as even/odd?
10709 if (CondElt.isUndef() || isNullConstant(CondElt))
10710 Mask[i] += Size;
10711 }
10712
10713 return true;
10714}
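
As an illustrative aside (not part of this file), the condition-to-mask conversion can be sketched on plain integers; here 1 models a true condition element, 0 false, and -1 undef:

#include <cassert>
#include <vector>

// Build a shuffle mask from a constant VSELECT condition: a true element
// keeps lane i of the first operand, a false or undef element takes lane i
// of the second operand (index i + Size).
static std::vector<int> maskFromSelectCondition(const std::vector<int> &Cond) {
  int Size = (int)Cond.size();
  std::vector<int> Mask(Size);
  for (int i = 0; i < Size; ++i)
    Mask[i] = (Cond[i] == 1) ? i : i + Size; // undef is treated like false
  return Mask;
}

int main() {
  // select <1,0,1,undef>, %a, %b  ==>  shuffle %a, %b, <0, 5, 2, 7>
  assert((maskFromSelectCondition({1, 0, 1, -1}) == std::vector<int>{0, 5, 2, 7}));
}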
10715
10716// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10717// instructions.
10718static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
10719 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10720 return false;
10721
10722 SmallVector<int, 8> Unpcklwd;
10723 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10724 /* Unary = */ false);
10725 SmallVector<int, 8> Unpckhwd;
10726 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10727 /* Unary = */ false);
10728 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
10729 isTargetShuffleEquivalent(Mask, Unpckhwd));
10730 return IsUnpackwdMask;
10731}
10732
10733static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
10734 // Create 128-bit vector type based on mask size.
10735 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10736 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10737
10738 // We can't assume a canonical shuffle mask, so try the commuted version too.
10739 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10740 ShuffleVectorSDNode::commuteMask(CommutedMask);
10741
10742 // Match any of unary/binary or low/high.
10743 for (unsigned i = 0; i != 4; ++i) {
10744 SmallVector<int, 16> UnpackMask;
10745 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10746 if (isTargetShuffleEquivalent(Mask, UnpackMask) ||
10747 isTargetShuffleEquivalent(CommutedMask, UnpackMask))
10748 return true;
10749 }
10750 return false;
10751}
10752
10753/// Return true if a shuffle mask chooses elements identically in its top and
10754/// bottom halves. For example, any splat mask has the same top and bottom
10755/// halves. If an element is undefined in only one half of the mask, the halves
10756/// are not considered identical.
10757static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10758   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10759 unsigned HalfSize = Mask.size() / 2;
10760 for (unsigned i = 0; i != HalfSize; ++i) {
10761 if (Mask[i] != Mask[i + HalfSize])
10762 return false;
10763 }
10764 return true;
10765}
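
A tiny standalone sketch of the same check (illustrative, not from this file):

#include <cassert>
#include <vector>

// True if the top and bottom halves of the mask pick elements identically,
// e.g. any splat mask. An undef in only one half counts as a mismatch.
static bool hasIdenticalHalves(const std::vector<int> &Mask) {
  size_t Half = Mask.size() / 2;
  for (size_t i = 0; i != Half; ++i)
    if (Mask[i] != Mask[i + Half])
      return false;
  return true;
}

int main() {
  assert(hasIdenticalHalves({2, 2, 2, 2, 2, 2, 2, 2}));    // splat
  assert(!hasIdenticalHalves({0, 1, 2, 3, 0, -1, 2, 3}));  // undef in one half only
}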
10766
10767/// Get a 4-lane 8-bit shuffle immediate for a mask.
10768///
10769/// This helper function produces an 8-bit shuffle immediate corresponding to
10770/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10771/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10772/// example.
10773///
10774/// NB: We rely heavily on "undef" masks preserving the input lane.
10775static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10776   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10777   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10778   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10779   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10780   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10781
10782 unsigned Imm = 0;
10783 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10784 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10785 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10786 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10787 return Imm;
10788}
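
The 2-bits-per-lane immediate encoding is easy to see in a standalone sketch (illustrative only; mirrors the bit packing above on a plain std::array):

#include <cassert>
#include <array>

// Pack a 4-lane mask into the immediate used by PSHUFD / SHUFPS and friends.
// Undef lanes (-1) fall back to their own index so the input lane is preserved.
static unsigned shuffleImm4(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

int main() {
  assert(shuffleImm4({3, 2, 1, 0}) == 0x1B);     // full reversal
  assert(shuffleImm4({0, 0, 0, 0}) == 0x00);     // broadcast of lane 0
  assert(shuffleImm4({-1, -1, -1, -1}) == 0xE4); // all undef -> identity 11 10 01 00
}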
10789
10790static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10791 SelectionDAG &DAG) {
10792 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10793}
10794
10795 // The shuffle result is as follows:
10796 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
10797 // Each element of Zeroable corresponds to a particular element of Mask,
10798 // as described in the computeZeroableShuffleElements function.
10799 //
10800 // The function looks for a sub-mask whose non-zero elements are in
10801 // increasing order. If such a sub-mask exists, the function returns true.
10802static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10803 ArrayRef<int> Mask, const EVT &VectorType,
10804 bool &IsZeroSideLeft) {
10805 int NextElement = -1;
10806 // Check if the Mask's nonzero elements are in increasing order.
10807 for (int i = 0, e = Mask.size(); i < e; i++) {
10808     // Check that the mask's zero elements are built only from zeros.
10809     assert(Mask[i] >= -1 && "Out of bound mask element!");
10810 if (Mask[i] < 0)
10811 return false;
10812 if (Zeroable[i])
10813 continue;
10814     // Find the lowest non-zero element.
10815 if (NextElement < 0) {
10816 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10817 IsZeroSideLeft = NextElement != 0;
10818 }
10819     // Exit if the mask's non-zero elements are not in increasing order.
10820 if (NextElement != Mask[i])
10821 return false;
10822 NextElement++;
10823 }
10824 return true;
10825}
10826
10827/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10828static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10829 ArrayRef<int> Mask, SDValue V1,
10830 SDValue V2, const APInt &Zeroable,
10831 const X86Subtarget &Subtarget,
10832 SelectionDAG &DAG) {
10833 int Size = Mask.size();
10834 int LaneSize = 128 / VT.getScalarSizeInBits();
10835 const int NumBytes = VT.getSizeInBits() / 8;
10836 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10837
10838   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10839          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10840          (Subtarget.hasBWI() && VT.is512BitVector()));
10841
10842 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10843 // Sign bit set in i8 mask means zero element.
10844 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10845
10846 SDValue V;
10847 for (int i = 0; i < NumBytes; ++i) {
10848 int M = Mask[i / NumEltBytes];
10849 if (M < 0) {
10850 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10851 continue;
10852 }
10853 if (Zeroable[i / NumEltBytes]) {
10854 PSHUFBMask[i] = ZeroMask;
10855 continue;
10856 }
10857
10858 // We can only use a single input of V1 or V2.
10859 SDValue SrcV = (M >= Size ? V2 : V1);
10860 if (V && V != SrcV)
10861 return SDValue();
10862 V = SrcV;
10863 M %= Size;
10864
10865 // PSHUFB can't cross lanes, ensure this doesn't happen.
10866 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10867 return SDValue();
10868
10869 M = M % LaneSize;
10870 M = M * NumEltBytes + (i % NumEltBytes);
10871 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10872 }
10873   assert(V && "Failed to find a source input");
10874
10875 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10876 return DAG.getBitcast(
10877 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10878 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10879}
10880
10881static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10882 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10883 const SDLoc &dl);
10884
10885 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
10886static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
10887 const APInt &Zeroable,
10888 ArrayRef<int> Mask, SDValue &V1,
10889 SDValue &V2, SelectionDAG &DAG,
10890 const X86Subtarget &Subtarget) {
10891 bool IsLeftZeroSide = true;
10892 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10893 IsLeftZeroSide))
10894 return SDValue();
10895 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10896 MVT IntegerType =
10897 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10898 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10899 unsigned NumElts = VT.getVectorNumElements();
10900   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10901          "Unexpected number of vector elements");
10902 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10903 Subtarget, DAG, DL);
10904 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10905 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10906 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10907}
10908
10909static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10910 unsigned &UnpackOpcode, bool IsUnary,
10911 ArrayRef<int> TargetMask, const SDLoc &DL,
10912 SelectionDAG &DAG,
10913 const X86Subtarget &Subtarget) {
10914 int NumElts = VT.getVectorNumElements();
10915
10916 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10917 for (int i = 0; i != NumElts; i += 2) {
10918 int M1 = TargetMask[i + 0];
10919 int M2 = TargetMask[i + 1];
10920 Undef1 &= (SM_SentinelUndef == M1);
10921 Undef2 &= (SM_SentinelUndef == M2);
10922 Zero1 &= isUndefOrZero(M1);
10923 Zero2 &= isUndefOrZero(M2);
10924 }
10925   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10926          "Zeroable shuffle detected");
10927
10928 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10929 SmallVector<int, 64> Unpckl, Unpckh;
10930 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10931 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
10932 UnpackOpcode = X86ISD::UNPCKL;
10933 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10934 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10935 return true;
10936 }
10937
10938 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10939 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
10940 UnpackOpcode = X86ISD::UNPCKH;
10941 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10942 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10943 return true;
10944 }
10945
10946   // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
10947 if (IsUnary && (Zero1 || Zero2)) {
10948 // Don't bother if we can blend instead.
10949 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10950 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10951 return false;
10952
10953 bool MatchLo = true, MatchHi = true;
10954 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10955 int M = TargetMask[i];
10956
10957 // Ignore if the input is known to be zero or the index is undef.
10958 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10959 (M == SM_SentinelUndef))
10960 continue;
10961
10962 MatchLo &= (M == Unpckl[i]);
10963 MatchHi &= (M == Unpckh[i]);
10964 }
10965
10966 if (MatchLo || MatchHi) {
10967 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10968 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10969 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10970 return true;
10971 }
10972 }
10973
10974 // If a binary shuffle, commute and try again.
10975 if (!IsUnary) {
10976 ShuffleVectorSDNode::commuteMask(Unpckl);
10977 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
10978 UnpackOpcode = X86ISD::UNPCKL;
10979 std::swap(V1, V2);
10980 return true;
10981 }
10982
10983 ShuffleVectorSDNode::commuteMask(Unpckh);
10984 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
10985 UnpackOpcode = X86ISD::UNPCKH;
10986 std::swap(V1, V2);
10987 return true;
10988 }
10989 }
10990
10991 return false;
10992}
10993
10994// X86 has dedicated unpack instructions that can handle specific blend
10995// operations: UNPCKH and UNPCKL.
10996static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
10997 ArrayRef<int> Mask, SDValue V1, SDValue V2,
10998 SelectionDAG &DAG) {
10999 SmallVector<int, 8> Unpckl;
11000 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
11001 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
11002 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
11003
11004 SmallVector<int, 8> Unpckh;
11005 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
11006 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
11007 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
11008
11009 // Commute and try again.
11010 ShuffleVectorSDNode::commuteMask(Unpckl);
11011 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
11012 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
11013
11014 ShuffleVectorSDNode::commuteMask(Unpckh);
11015 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
11016 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
11017
11018 return SDValue();
11019}
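
The unpack masks being matched here follow a simple interleaving pattern. An illustrative standalone sketch for a single 128-bit vector (the real createUnpackShuffleMask additionally repeats this per 128-bit lane for wider types; names and types here are the sketch's own):

#include <cassert>
#include <vector>

// Shuffle masks implemented by PUNPCKL*/PUNPCKH*: interleave the low (or high)
// halves of the two inputs. For v4i32 this gives {0,4,1,5} (lo) and {2,6,3,7} (hi).
static std::vector<int> unpackMask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  int Half = NumElts / 2;
  int Base = Lo ? 0 : Half;
  for (int i = 0; i < Half; ++i) {
    Mask.push_back(Base + i);           // element from the first input
    Mask.push_back(Base + i + NumElts); // matching element from the second
  }
  return Mask;
}

int main() {
  assert((unpackMask(4, /*Lo=*/true) == std::vector<int>{0, 4, 1, 5}));
  assert((unpackMask(4, /*Lo=*/false) == std::vector<int>{2, 6, 3, 7}));
}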
11020
11021/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
11022/// followed by unpack 256-bit.
11023static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
11024 ArrayRef<int> Mask, SDValue V1,
11025 SDValue V2, SelectionDAG &DAG) {
11026 SmallVector<int, 32> Unpckl, Unpckh;
11027 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
11028 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
11029
11030 unsigned UnpackOpcode;
11031 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
11032 UnpackOpcode = X86ISD::UNPCKL;
11033 else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
11034 UnpackOpcode = X86ISD::UNPCKH;
11035 else
11036 return SDValue();
11037
11038 // This is a "natural" unpack operation (rather than the 128-bit sectored
11039 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
11040 // input in order to use the x86 instruction.
11041 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
11042 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
11043 V1 = DAG.getBitcast(VT, V1);
11044 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
11045}
11046
11047static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
11048 int Delta) {
11049 int Size = (int)Mask.size();
11050 int Split = Size / Delta;
11051 int TruncatedVectorStart = SwappedOps ? Size : 0;
11052
11053 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
11054 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
11055 return false;
11056
11057 // The rest of the mask should not refer to the truncated vector's elements.
11058 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
11059 TruncatedVectorStart + Size))
11060 return false;
11061
11062 return true;
11063}
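
A simplified standalone sketch of the stride test above (illustrative only; it assumes the truncated input is the first shuffle operand, i.e. the SwappedOps handling is omitted):

#include <cassert>
#include <vector>

// The truncating-shuffle pattern: the first Size/Delta entries must step
// through the truncated input with stride Delta (e.g. 0,2,4,6 for Delta = 2),
// and no later entry may refer back to that input.
static bool looksLikeTruncate(const std::vector<int> &Mask, int Delta) {
  int Size = (int)Mask.size();
  int Split = Size / Delta;
  for (int i = 0; i < Split; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * Delta)
      return false;
  for (int i = Split; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) // still points into the truncated input
      return false;
  return true;
}

int main() {
  // v8i16 shuffle <0,2,4,6,12,13,14,15>: the low half takes every other
  // element, the high half only touches the second (zero) input.
  assert(looksLikeTruncate({0, 2, 4, 6, 12, 13, 14, 15}, 2));
  assert(!looksLikeTruncate({0, 1, 2, 3, 12, 13, 14, 15}, 2));
}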
11064
11065// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
11066//
11067// An example is the following:
11068//
11069// t0: ch = EntryToken
11070// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
11071// t25: v4i32 = truncate t2
11072// t41: v8i16 = bitcast t25
11073// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
11074// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
11075// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
11076// t18: v2i64 = bitcast t51
11077//
11078// Without avx512vl, this is lowered to:
11079//
11080// vpmovqd %zmm0, %ymm0
11081// vpshufb {{.*#+}} xmm0 =
11082// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
11083//
11084// But when avx512vl is available, one can just use a single vpmovdw
11085// instruction.
11086static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
11087 MVT VT, SDValue V1, SDValue V2,
11088 SelectionDAG &DAG,
11089 const X86Subtarget &Subtarget) {
11090 if (VT != MVT::v16i8 && VT != MVT::v8i16)
11091 return SDValue();
11092
11093 if (Mask.size() != VT.getVectorNumElements())
11094 return SDValue();
11095
11096 bool SwappedOps = false;
11097
11098 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
11099 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
11100 return SDValue();
11101
11102 std::swap(V1, V2);
11103 SwappedOps = true;
11104 }
11105
11106 // Look for:
11107 //
11108 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
11109 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
11110 //
11111 // and similar ones.
11112 if (V1.getOpcode() != ISD::BITCAST)
11113 return SDValue();
11114 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
11115 return SDValue();
11116
11117 SDValue Src = V1.getOperand(0).getOperand(0);
11118 MVT SrcVT = Src.getSimpleValueType();
11119
11120 // The vptrunc** instructions truncating 128 bit and 256 bit vectors
11121 // are only available with avx512vl.
11122 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
11123 return SDValue();
11124
11125 // Down Convert Word to Byte is only available with avx512bw. The case with
11126 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
11127 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
11128 !Subtarget.hasBWI())
11129 return SDValue();
11130
11131 // The first half/quarter of the mask should refer to every second/fourth
11132 // element of the vector truncated and bitcasted.
11133 if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
11134 !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
11135 return SDValue();
11136
11137 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
11138}
11139
11140// X86 has dedicated pack instructions that can handle specific truncation
11141// operations: PACKSS and PACKUS.
11142static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
11143 unsigned &PackOpcode, ArrayRef<int> TargetMask,
11144 SelectionDAG &DAG,
11145 const X86Subtarget &Subtarget) {
11146 unsigned NumElts = VT.getVectorNumElements();
11147 unsigned BitSize = VT.getScalarSizeInBits();
11148 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
11149 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
11150
11151 auto MatchPACK = [&](SDValue N1, SDValue N2) {
11152 SDValue VV1 = DAG.getBitcast(PackVT, N1);
11153 SDValue VV2 = DAG.getBitcast(PackVT, N2);
11154 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
11155 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
11156 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
11157 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
11158 V1 = VV1;
11159 V2 = VV2;
11160 SrcVT = PackVT;
11161 PackOpcode = X86ISD::PACKUS;
11162 return true;
11163 }
11164 }
11165 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
11166 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
11167 V1 = VV1;
11168 V2 = VV2;
11169 SrcVT = PackVT;
11170 PackOpcode = X86ISD::PACKSS;
11171 return true;
11172 }
11173 return false;
11174 };
11175
11176 // Try binary shuffle.
11177 SmallVector<int, 32> BinaryMask;
11178 createPackShuffleMask(VT, BinaryMask, false);
11179 if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
11180 if (MatchPACK(V1, V2))
11181 return true;
11182
11183 // Try unary shuffle.
11184 SmallVector<int, 32> UnaryMask;
11185 createPackShuffleMask(VT, UnaryMask, true);
11186 if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
11187 if (MatchPACK(V1, V1))
11188 return true;
11189
11190 return false;
11191}
11192
11193static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
11194 SDValue V1, SDValue V2, SelectionDAG &DAG,
11195 const X86Subtarget &Subtarget) {
11196 MVT PackVT;
11197 unsigned PackOpcode;
11198 if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
11199 Subtarget))
11200 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
11201 DAG.getBitcast(PackVT, V2));
11202
11203 return SDValue();
11204}
11205
11206/// Try to emit a bitmask instruction for a shuffle.
11207///
11208/// This handles cases where we can model a blend exactly as a bitmask due to
11209/// one of the inputs being zeroable.
11210static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11211 SDValue V2, ArrayRef<int> Mask,
11212 const APInt &Zeroable,
11213 const X86Subtarget &Subtarget,
11214 SelectionDAG &DAG) {
11215 MVT MaskVT = VT;
11216 MVT EltVT = VT.getVectorElementType();
11217 SDValue Zero, AllOnes;
11218 // Use f64 if i64 isn't legal.
11219 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11220 EltVT = MVT::f64;
11221 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11222 }
11223
11224 MVT LogicVT = VT;
11225 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
11226 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11227 AllOnes = DAG.getConstantFP(
11228 APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
11229 LogicVT =
11230 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
11231 } else {
11232 Zero = DAG.getConstant(0, DL, EltVT);
11233 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11234 }
11235
11236 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11237 SDValue V;
11238 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11239 if (Zeroable[i])
11240 continue;
11241 if (Mask[i] % Size != i)
11242 return SDValue(); // Not a blend.
11243 if (!V)
11244 V = Mask[i] < Size ? V1 : V2;
11245 else if (V != (Mask[i] < Size ? V1 : V2))
11246 return SDValue(); // Can only let one input through the mask.
11247
11248 VMaskOps[i] = AllOnes;
11249 }
11250 if (!V)
11251 return SDValue(); // No non-zeroable elements!
11252
11253 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11254 VMask = DAG.getBitcast(LogicVT, VMask);
11255 V = DAG.getBitcast(LogicVT, V);
11256 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11257 return DAG.getBitcast(VT, And);
11258}
11259
11260/// Try to emit a blend instruction for a shuffle using bit math.
11261///
11262/// This is used as a fallback approach when first class blend instructions are
11263/// unavailable. Currently it is only suitable for integer vectors, but could
11264/// be generalized for floating point vectors if desirable.
11265static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11266 SDValue V2, ArrayRef<int> Mask,
11267 SelectionDAG &DAG) {
11268   assert(VT.isInteger() && "Only supports integer vector types!");
11269 MVT EltVT = VT.getVectorElementType();
11270 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11271 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11272 SmallVector<SDValue, 16> MaskOps;
11273 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11274 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11275 return SDValue(); // Shuffled input!
11276 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11277 }
11278
11279 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11280 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
11281 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
11282 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
11283}
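
The fallback above is the classic masked-merge identity. A one-lane standalone sketch (illustrative, plain scalar code rather than the DAG nodes used above):

#include <cassert>
#include <cstdint>

// The integer fallback blend on a single lane: build an all-ones mask M for
// lanes taken from V1, then compute (V1 & M) | (V2 & ~M), matching the
// AND / ANDNP / OR sequence emitted above.
static uint32_t bitBlendLane(uint32_t V1, uint32_t V2, bool TakeV1) {
  uint32_t M = TakeV1 ? 0xFFFFFFFFu : 0u;
  return (V1 & M) | (V2 & ~M);
}

int main() {
  assert(bitBlendLane(0x11111111u, 0x22222222u, true) == 0x11111111u);
  assert(bitBlendLane(0x11111111u, 0x22222222u, false) == 0x22222222u);
}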
11284
11285static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11286 SDValue PreservedSrc,
11287 const X86Subtarget &Subtarget,
11288 SelectionDAG &DAG);
11289
11290static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
11291 MutableArrayRef<int> Mask,
11292 const APInt &Zeroable, bool &ForceV1Zero,
11293 bool &ForceV2Zero, uint64_t &BlendMask) {
11294 bool V1IsZeroOrUndef =
11295 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11296 bool V2IsZeroOrUndef =
11297 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11298
11299 BlendMask = 0;
11300 ForceV1Zero = false, ForceV2Zero = false;
11301   assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11302
11303 // Attempt to generate the binary blend mask. If an input is zero then
11304 // we can use any lane.
11305 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11306 int M = Mask[i];
11307 if (M == SM_SentinelUndef)
11308 continue;
11309 if (M == i)
11310 continue;
11311 if (M == i + Size) {
11312 BlendMask |= 1ull << i;
11313 continue;
11314 }
11315 if (Zeroable[i]) {
11316 if (V1IsZeroOrUndef) {
11317 ForceV1Zero = true;
11318 Mask[i] = i;
11319 continue;
11320 }
11321 if (V2IsZeroOrUndef) {
11322 ForceV2Zero = true;
11323 BlendMask |= 1ull << i;
11324 Mask[i] = i + Size;
11325 continue;
11326 }
11327 }
11328 return false;
11329 }
11330 return true;
11331}
11332
11333static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
11334 int Scale) {
11335 uint64_t ScaledMask = 0;
11336 for (int i = 0; i != Size; ++i)
11337 if (BlendMask & (1ull << i))
11338 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
11339 return ScaledMask;
11340}
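
How the blend immediate is derived and then rescaled can be shown in a standalone sketch (illustrative only; it ignores the undef/zeroable handling that matchShuffleAsBlend performs):

#include <cassert>
#include <cstdint>
#include <vector>

// Derive the BLENDI immediate from a shuffle mask: bit i is set when lane i
// comes from the second input.
static uint64_t blendMaskFromShuffle(const std::vector<int> &Mask) {
  uint64_t BlendMask = 0;
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= Size) // element taken from V2
      BlendMask |= 1ull << i;
  return BlendMask;
}

// Widen the mask when the blend is performed at a smaller element granularity.
static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  // shuffle <4 x i32> %a, %b, <0, 5, 2, 7> blends lanes 1 and 3 from %b.
  uint64_t BM = blendMaskFromShuffle({0, 5, 2, 7});
  assert(BM == 0b1010);
  // Rewriting the v4i32 blend as a v8i16 blend doubles each mask bit.
  assert(scaleBlendMask(BM, 4, 2) == 0b11001100);
}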
11341
11342/// Try to emit a blend instruction for a shuffle.
11343///
11344/// This doesn't do any checks for the availability of instructions for blending
11345/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11346/// be matched in the backend with the type given. What it does check for is
11347/// that the shuffle mask is a blend, or convertible into a blend with zero.
11348static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11349 SDValue V2, ArrayRef<int> Original,
11350 const APInt &Zeroable,
11351 const X86Subtarget &Subtarget,
11352 SelectionDAG &DAG) {
11353 uint64_t BlendMask = 0;
11354 bool ForceV1Zero = false, ForceV2Zero = false;
11355 SmallVector<int, 64> Mask(Original.begin(), Original.end());
11356 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11357 BlendMask))
11358 return SDValue();
11359
11360 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11361 if (ForceV1Zero)
11362 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11363 if (ForceV2Zero)
11364 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11365
11366 switch (VT.SimpleTy) {
11367 case MVT::v4i64:
11368 case MVT::v8i32:
11369     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11370     LLVM_FALLTHROUGH;
11371 case MVT::v4f64:
11372 case MVT::v8f32:
11373     assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11374     LLVM_FALLTHROUGH;
11375 case MVT::v2f64:
11376 case MVT::v2i64:
11377 case MVT::v4f32:
11378 case MVT::v4i32:
11379 case MVT::v8i16:
11380     assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11381 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11382 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11383 case MVT::v16i16: {
11384     assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11385 SmallVector<int, 8> RepeatedMask;
11386 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11387 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11388       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11389 BlendMask = 0;
11390 for (int i = 0; i < 8; ++i)
11391 if (RepeatedMask[i] >= 8)
11392 BlendMask |= 1ull << i;
11393 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11394 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11395 }
11396 // Use PBLENDW for lower/upper lanes and then blend lanes.
11397 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11398 // merge to VSELECT where useful.
11399 uint64_t LoMask = BlendMask & 0xFF;
11400 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11401 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11402 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11403 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11404 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11405 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11406 return DAG.getVectorShuffle(
11407 MVT::v16i16, DL, Lo, Hi,
11408 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11409 }
11410     LLVM_FALLTHROUGH;
11411 }
11412 case MVT::v32i8:
11413     assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11414     LLVM_FALLTHROUGH;
11415 case MVT::v16i8: {
11416     assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11417
11418 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11419 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11420 Subtarget, DAG))
11421 return Masked;
11422
11423 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11424 MVT IntegerType =
11425 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11426 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11427 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11428 }
11429
11430 // Scale the blend by the number of bytes per element.
11431 int Scale = VT.getScalarSizeInBits() / 8;
11432
11433 // This form of blend is always done on bytes. Compute the byte vector
11434 // type.
11435 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11436
11437 // x86 allows load folding with blendvb from the 2nd source operand. But
11438 // we are still using LLVM select here (see comment below), so that's V1.
11439 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11440 // allow that load-folding possibility.
11441 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11442 ShuffleVectorSDNode::commuteMask(Mask);
11443 std::swap(V1, V2);
11444 }
11445
11446 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11447 // mix of LLVM's code generator and the x86 backend. We tell the code
11448 // generator that boolean values in the elements of an x86 vector register
11449 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11450 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11451 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11452 // of the element (the remaining are ignored) and 0 in that high bit would
11453 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11454 // the LLVM model for boolean values in vector elements gets the relevant
11455 // bit set, it is set backwards and over constrained relative to x86's
11456 // actual model.
11457 SmallVector<SDValue, 32> VSELECTMask;
11458 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11459 for (int j = 0; j < Scale; ++j)
11460 VSELECTMask.push_back(
11461 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
11462 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
11463 MVT::i8));
11464
11465 V1 = DAG.getBitcast(BlendVT, V1);
11466 V2 = DAG.getBitcast(BlendVT, V2);
11467 return DAG.getBitcast(
11468 VT,
11469 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11470 V1, V2));
11471 }
11472 case MVT::v16f32:
11473 case MVT::v8f64:
11474 case MVT::v8i64:
11475 case MVT::v16i32:
11476 case MVT::v32i16:
11477 case MVT::v64i8: {
11478 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11479 bool OptForSize = DAG.shouldOptForSize();
11480 if (!OptForSize) {
11481 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11482 Subtarget, DAG))
11483 return Masked;
11484 }
11485
11486 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11487 // masked move.
11488 MVT IntegerType =
11489 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
11490 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11491 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11492 }
11493 default:
11494 llvm_unreachable("Not a supported integer vector type!");
11495 }
11496}
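The VSELECT path above widens each shuffle-mask element into Scale per-byte selectors: all-ones picks a byte of V1, zero picks a byte of V2, and undef elements stay undef. A minimal standalone sketch of that expansion outside the SelectionDAG machinery; the helper name and the -2 undef sentinel are illustrative, not from the source.

#include <vector>

// Hypothetical analog of the VSELECT mask construction above: -1 selects the
// byte from V1, 0 selects it from V2, and -2 stands in for an undef byte.
std::vector<int> expandToByteSelectMask(const std::vector<int> &Mask,
                                        int EltSizeInBytes) {
  const int Size = (int)Mask.size();
  std::vector<int> ByteMask;
  for (int i = 0; i < Size; ++i)
    for (int j = 0; j < EltSizeInBytes; ++j)
      ByteMask.push_back(Mask[i] < 0 ? -2 : (Mask[i] < Size ? -1 : 0));
  return ByteMask;
}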
11497
11498/// Try to lower as a blend of elements from two inputs followed by
11499/// a single-input permutation.
11500///
11501/// This matches the pattern where we can blend elements from two inputs and
11502/// then reduce the shuffle to a single-input permutation.
11503static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11504 SDValue V1, SDValue V2,
11505 ArrayRef<int> Mask,
11506 SelectionDAG &DAG,
11507 bool ImmBlends = false) {
11508 // We build up the blend mask while checking whether a blend is a viable way
11509 // to reduce the shuffle.
11510 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11511 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11512
11513 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11514 if (Mask[i] < 0)
11515 continue;
11516
11517 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11518
11519 if (BlendMask[Mask[i] % Size] < 0)
11520 BlendMask[Mask[i] % Size] = Mask[i];
11521 else if (BlendMask[Mask[i] % Size] != Mask[i])
11522 return SDValue(); // Can't blend in the needed input!
11523
11524 PermuteMask[i] = Mask[i] % Size;
11525 }
11526
11527 // If only immediate blends, then bail if the blend mask can't be widened to
11528 // i16.
11529 unsigned EltSize = VT.getScalarSizeInBits();
11530 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11531 return SDValue();
11532
11533 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11534 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11535}
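The same decomposition can be sketched on plain integer masks: the blend succeeds only if each slot is claimed by at most one input element, and the leftover work is a single-input permute. The function below is a hypothetical standalone restatement, not taken from the source.

#include <vector>

// Returns true and fills PermuteMask if Mask splits into blend + permute;
// BlendMask records which original element each blended slot must hold.
bool decomposeBlendPermute(const std::vector<int> &Mask,
                           std::vector<int> &BlendMask,
                           std::vector<int> &PermuteMask) {
  const int Size = (int)Mask.size();
  BlendMask.assign(Size, -1);
  PermuteMask.assign(Size, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (BlendMask[Slot] < 0)
      BlendMask[Slot] = Mask[i];
    else if (BlendMask[Slot] != Mask[i])
      return false; // both inputs want the same blend slot
    PermuteMask[i] = Slot;
  }
  return true;
}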
11536
11537/// Try to lower as an unpack of elements from two inputs followed by
11538/// a single-input permutation.
11539///
11540/// This matches the pattern where we can unpack elements from two inputs and
11541/// then reduce the shuffle to a single-input (wider) permutation.
11542static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11543 SDValue V1, SDValue V2,
11544 ArrayRef<int> Mask,
11545 SelectionDAG &DAG) {
11546 int NumElts = Mask.size();
11547 int NumLanes = VT.getSizeInBits() / 128;
11548 int NumLaneElts = NumElts / NumLanes;
11549 int NumHalfLaneElts = NumLaneElts / 2;
11550
11551 bool MatchLo = true, MatchHi = true;
11552 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11553
11554 // Determine UNPCKL/UNPCKH type and operand order.
11555 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11556 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
11557 int M = Mask[Lane + Elt];
11558 if (M < 0)
11559 continue;
11560
11561 SDValue &Op = Ops[Elt & 1];
11562 if (M < NumElts && (Op.isUndef() || Op == V1))
11563 Op = V1;
11564 else if (NumElts <= M && (Op.isUndef() || Op == V2))
11565 Op = V2;
11566 else
11567 return SDValue();
11568
11569 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11570 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
11571 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
11572 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
11573 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
11574 if (!MatchLo && !MatchHi)
11575 return SDValue();
11576 }
11577 }
11578 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11579
11580 // Now check that each pair of elts come from the same unpack pair
11581 // and set the permute mask based on each pair.
11582 // TODO - Investigate cases where we permute individual elements.
11583 SmallVector<int, 32> PermuteMask(NumElts, -1);
11584 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11585 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
11586 int M0 = Mask[Lane + Elt + 0];
11587 int M1 = Mask[Lane + Elt + 1];
11588 if (0 <= M0 && 0 <= M1 &&
11589 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
11590 return SDValue();
11591 if (0 <= M0)
11592 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
11593 if (0 <= M1)
11594 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
11595 }
11596 }
11597
11598 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11599 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11600 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11601}
11602
11603/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11604/// permuting the elements of the result in place.
11605static SDValue lowerShuffleAsByteRotateAndPermute(
11606 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11607 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11608 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11609 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11610 (VT.is512BitVector() && !Subtarget.hasBWI()))
11611 return SDValue();
11612
11613 // We don't currently support lane crossing permutes.
11614 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11615 return SDValue();
11616
11617 int Scale = VT.getScalarSizeInBits() / 8;
11618 int NumLanes = VT.getSizeInBits() / 128;
11619 int NumElts = VT.getVectorNumElements();
11620 int NumEltsPerLane = NumElts / NumLanes;
11621
11622 // Determine range of mask elts.
11623 bool Blend1 = true;
11624 bool Blend2 = true;
11625 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11626 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11627 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11628 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11629 int M = Mask[Lane + Elt];
11630 if (M < 0)
11631 continue;
11632 if (M < NumElts) {
11633 Blend1 &= (M == (Lane + Elt));
11634 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11635 M = M % NumEltsPerLane;
11636 Range1.first = std::min(Range1.first, M);
11637 Range1.second = std::max(Range1.second, M);
11638 } else {
11639 M -= NumElts;
11640 Blend2 &= (M == (Lane + Elt));
11641 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11642 M = M % NumEltsPerLane;
11643 Range2.first = std::min(Range2.first, M);
11644 Range2.second = std::max(Range2.second, M);
11645 }
11646 }
11647 }
11648
11649 // Bail if we don't need both elements.
11650 // TODO - it might be worth doing this for unary shuffles if the permute
11651 // can be widened.
11652 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11653 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11654 return SDValue();
11655
11656 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11657 return SDValue();
11658
11659 // Rotate the 2 ops so we can access both ranges, then permute the result.
11660 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11661 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11662 SDValue Rotate = DAG.getBitcast(
11663 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11664 DAG.getBitcast(ByteVT, Lo),
11665 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11666 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11667 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11668 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11669 int M = Mask[Lane + Elt];
11670 if (M < 0)
11671 continue;
11672 if (M < NumElts)
11673 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11674 else
11675 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11676 }
11677 }
11678 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11679 };
11680
11681 // Check if the ranges are small enough to rotate from either direction.
11682 if (Range2.second < Range1.first)
11683 return RotateAndPermute(V1, V2, Range1.first, 0);
11684 if (Range1.second < Range2.first)
11685 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11686 return SDValue();
11687}
11688
11689/// Generic routine to decompose a shuffle and blend into independent
11690/// blends and permutes.
11691///
11692/// This matches the extremely common pattern for handling combined
11693/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11694/// operations. It will try to pick the best arrangement of shuffles and
11695/// blends.
11696static SDValue lowerShuffleAsDecomposedShuffleBlend(
11697 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11698 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11699 // Shuffle the input elements into the desired positions in V1 and V2 and
11700 // blend them together.
11701 SmallVector<int, 32> V1Mask(Mask.size(), -1);
11702 SmallVector<int, 32> V2Mask(Mask.size(), -1);
11703 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11704 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11705 if (Mask[i] >= 0 && Mask[i] < Size) {
11706 V1Mask[i] = Mask[i];
11707 BlendMask[i] = i;
11708 } else if (Mask[i] >= Size) {
11709 V2Mask[i] = Mask[i] - Size;
11710 BlendMask[i] = i + Size;
11711 }
11712
11713 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11714 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11715 // the shuffle may be able to fold with a load or other benefit. However, when
11716 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11717 // pre-shuffle first is a better strategy.
11718 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11719 // Only prefer immediate blends to unpack/rotate.
11720 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11721 DAG, true))
11722 return BlendPerm;
11723 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
11724 DAG))
11725 return UnpackPerm;
11726 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11727 DL, VT, V1, V2, Mask, Subtarget, DAG))
11728 return RotatePerm;
11729 // Unpack/rotate failed - try again with variable blends.
11730 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11731 DAG))
11732 return BlendPerm;
11733 }
11734
11735 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11736 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11737 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11738}
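As a worked picture of the fallback path: the two-input mask is split into a per-input shuffle plus an element-wise blend. A hypothetical standalone version of that split, mirroring the loop above:

#include <vector>

struct DecomposedMasks {
  std::vector<int> V1Mask, V2Mask, BlendMask;
};

// Element i either comes from V1 (index < Size) or V2 (index >= Size); each
// input is pre-shuffled into place and the results are blended lane-by-lane.
DecomposedMasks splitShuffleBlend(const std::vector<int> &Mask) {
  const int Size = (int)Mask.size();
  DecomposedMasks D{std::vector<int>(Size, -1), std::vector<int>(Size, -1),
                    std::vector<int>(Size, -1)};
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] < Size) {
      D.V1Mask[i] = Mask[i];     // element placed by V1's shuffle
      D.BlendMask[i] = i;        // blend then keeps lane i of shuffled V1
    } else if (Mask[i] >= Size) {
      D.V2Mask[i] = Mask[i] - Size;
      D.BlendMask[i] = i + Size; // blend takes lane i of shuffled V2
    }
  }
  return D;
}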
11739
11740/// Try to lower a vector shuffle as a bit rotation.
11741///
11742/// Look for a repeated rotation pattern in each sub group.
11743 /// Returns an ISD::ROTL element rotation amount or -1 if failed.
11744static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
11745 int NumElts = Mask.size();
11746 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
11747
11748 int RotateAmt = -1;
11749 for (int i = 0; i != NumElts; i += NumSubElts) {
11750 for (int j = 0; j != NumSubElts; ++j) {
11751 int M = Mask[i + j];
11752 if (M < 0)
11753 continue;
11754 if (!isInRange(M, i, i + NumSubElts))
11755 return -1;
11756 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
11757 if (0 <= RotateAmt && Offset != RotateAmt)
11758 return -1;
11759 RotateAmt = Offset;
11760 }
11761 }
11762 return RotateAmt;
11763}
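The rotation check can be restated outside LLVM: every defined element must stay within its sub-group and imply the same ROTL amount, computed as (NumSubElts - (M - pos)) % NumSubElts. A hypothetical self-contained version, with a small check in main:

#include <cassert>
#include <vector>

int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;                       // undef matches anything
      if (M < i || M >= i + NumSubElts)
        return -1;                      // crosses the sub-group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1;                      // inconsistent rotation amount
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  // A v4i32-style mask {1, 2, 3, 0} is a ROTL by 3 elements within the group.
  assert(matchBitRotate({1, 2, 3, 0}, 4) == 3);
}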
11764
11765static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11766 const X86Subtarget &Subtarget,
11767 ArrayRef<int> Mask) {
11768 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11769 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11770
11771 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11772 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11773 int MaxSubElts = 64 / EltSizeInBits;
11774 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
11775 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
11776 if (RotateAmt < 0)
11777 continue;
11778
11779 int NumElts = Mask.size();
11780 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11781 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11782 return RotateAmt * EltSizeInBits;
11783 }
11784
11785 return -1;
11786}
11787
11788/// Lower shuffle using X86ISD::VROTLI rotations.
11789static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11790 ArrayRef<int> Mask,
11791 const X86Subtarget &Subtarget,
11792 SelectionDAG &DAG) {
11793 // Only XOP + AVX512 targets have bit rotation instructions.
11794 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11795 bool IsLegal =
11796 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11797 if (!IsLegal && Subtarget.hasSSE3())
11798 return SDValue();
11799
11800 MVT RotateVT;
11801 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11802 Subtarget, Mask);
11803 if (RotateAmt < 0)
11804 return SDValue();
11805
11806 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11807 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11808 // widen to vXi16 or more then the existing lowering will be better.
11809 if (!IsLegal) {
11810 if ((RotateAmt % 16) == 0)
11811 return SDValue();
11812 // TODO: Use getTargetVShiftByConstNode.
11813 unsigned ShlAmt = RotateAmt;
11814 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11815 V1 = DAG.getBitcast(RotateVT, V1);
11816 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11817 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11818 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11819 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11820 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11821 return DAG.getBitcast(VT, Rot);
11822 }
11823
11824 SDValue Rot =
11825 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11826 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11827 return DAG.getBitcast(VT, Rot);
11828}
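The non-legal fallback above builds the rotate as OR(VSHLI, VSRLI) on the widened element type. A scalar analog of that expansion, assuming a 32-bit element and a non-zero amount as in the vector path:

#include <cstdint>

// Left-rotate expanded to OR(SHL, SRL), matching the VSHLI/VSRLI/OR nodes.
uint32_t rotl32(uint32_t V, unsigned Amt) {
  Amt &= 31;
  return Amt ? ((V << Amt) | (V >> (32 - Amt))) : V;
}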
11829
11830/// Try to lower a vector shuffle as a byte rotation.
11831///
11832 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11833static int matchShuffleAsByteRotate(SDValue &V1, SDValue &V2,
11834 ArrayRef<int> Mask) {
11835 int NumElts = Mask.size();
11836
11837 // We need to detect various ways of spelling a rotation:
11838 // [11, 12, 13, 14, 15, 0, 1, 2]
11839 // [-1, 12, 13, 14, -1, -1, 1, -1]
11840 // [-1, -1, -1, -1, -1, -1, 1, 2]
11841 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11842 // [-1, 4, 5, 6, -1, -1, 9, -1]
11843 // [-1, 4, 5, 6, -1, -1, -1, -1]
11844 int Rotation = 0;
11845 SDValue Lo, Hi;
11846 for (int i = 0; i < NumElts; ++i) {
11847 int M = Mask[i];
11848 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11849 "Unexpected mask index.");
11850 if (M < 0)
11851 continue;
11852
11853 // Determine where a rotated vector would have started.
11854 int StartIdx = i - (M % NumElts);
11855 if (StartIdx == 0)
11856 // The identity rotation isn't interesting, stop.
11857 return -1;
11858
11859 // If we found the tail of a vector the rotation must be the missing
11860 // front. If we found the head of a vector, it must be how much of the
11861 // head.
11862 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11863
11864 if (Rotation == 0)
11865 Rotation = CandidateRotation;
11866 else if (Rotation != CandidateRotation)
11867 // The rotations don't match, so we can't match this mask.
11868 return -1;
11869
11870 // Compute which value this mask is pointing at.
11871 SDValue MaskV = M < NumElts ? V1 : V2;
11872
11873 // Compute which of the two target values this index should be assigned
11874 // to. This reflects whether the high elements are remaining or the low
11875 // elements are remaining.
11876 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11877
11878 // Either set up this value if we've not encountered it before, or check
11879 // that it remains consistent.
11880 if (!TargetV)
11881 TargetV = MaskV;
11882 else if (TargetV != MaskV)
11883 // This may be a rotation, but it pulls from the inputs in some
11884 // unsupported interleaving.
11885 return -1;
11886 }
11887
11888 // Check that we successfully analyzed the mask, and normalize the results.
11889 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11890 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11891 if (!Lo)
11892 Lo = Hi;
11893 else if (!Hi)
11894 Hi = Lo;
11895
11896 V1 = Lo;
11897 V2 = Hi;
11898
11899 return Rotation;
11900}
11901
11902/// Try to lower a vector shuffle as a byte rotation.
11903///
11904/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11905/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11906/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11907 /// try to generically lower a vector shuffle through such a pattern. It
11908/// does not check for the profitability of lowering either as PALIGNR or
11909/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11910/// This matches shuffle vectors that look like:
11911///
11912/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11913///
11914/// Essentially it concatenates V1 and V2, shifts right by some number of
11915/// elements, and takes the low elements as the result. Note that while this is
11916/// specified as a *right shift* because x86 is little-endian, it is a *left
11917/// rotate* of the vector lanes.
11918static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11919 ArrayRef<int> Mask) {
11920 // Don't accept any shuffles with zero elements.
11921 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
11922 return -1;
11923
11924 // PALIGNR works on 128-bit lanes.
11925 SmallVector<int, 16> RepeatedMask;
11926 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11927 return -1;
11928
11929 int Rotation = matchShuffleAsByteRotate(V1, V2, RepeatedMask);
11930 if (Rotation <= 0)
11931 return -1;
11932
11933 // PALIGNR rotates bytes, so we need to scale the
11934 // rotation based on how many bytes are in the vector lane.
11935 int NumElts = RepeatedMask.size();
11936 int Scale = 16 / NumElts;
11937 return Rotation * Scale;
11938}
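Because PALIGNR counts bytes within each 128-bit lane, the matched per-element rotation is scaled by the element width in bytes. A tiny compile-time check of that scaling (illustrative only):

// For a per-lane element rotation R over NumLaneElts elements, the PALIGNR
// immediate is R * (16 / NumLaneElts) bytes.
constexpr int byteRotation(int EltRotation, int NumLaneElts) {
  return EltRotation * (16 / NumLaneElts);
}
static_assert(byteRotation(3, 8) == 6, "v8i16 rotate by 3 elts = 6 bytes");
static_assert(byteRotation(1, 4) == 4, "v4i32 rotate by 1 elt  = 4 bytes");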
11939
11940static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11941 SDValue V2, ArrayRef<int> Mask,
11942 const X86Subtarget &Subtarget,
11943 SelectionDAG &DAG) {
11944 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11945
11946 SDValue Lo = V1, Hi = V2;
11947 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11948 if (ByteRotation <= 0)
11949 return SDValue();
11950
11951 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11952 // PSLLDQ/PSRLDQ.
11953 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11954 Lo = DAG.getBitcast(ByteVT, Lo);
11955 Hi = DAG.getBitcast(ByteVT, Hi);
11956
11957 // SSSE3 targets can use the palignr instruction.
11958 if (Subtarget.hasSSSE3()) {
11959 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11960 "512-bit PALIGNR requires BWI instructions");
11961 return DAG.getBitcast(
11962 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11963 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11964 }
11965
11966 assert(VT.is128BitVector() &&
11967 "Rotate-based lowering only supports 128-bit lowering!");
11968 assert(Mask.size() <= 16 &&
11969 "Can shuffle at most 16 bytes in a 128-bit vector!");
11970 assert(ByteVT == MVT::v16i8 &&
11971 "SSE2 rotate lowering only needed for v16i8!");
11972
11973 // Default SSE2 implementation
11974 int LoByteShift = 16 - ByteRotation;
11975 int HiByteShift = ByteRotation;
11976
11977 SDValue LoShift =
11978 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11979 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11980 SDValue HiShift =
11981 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11982 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11983 return DAG.getBitcast(VT,
11984 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11985}
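A byte-level model of the SSE2 fallback (OR of PSLLDQ(Lo, 16 - R) and PSRLDQ(Hi, R)); the function is illustrative only and assumes 0 < R < 16, which matchShuffleAsByteRotate guarantees for a non-trivial rotation:

#include <array>
#include <cstdint>

// Byte i of the result comes from Hi[i + R] while that index is in range and
// from Lo afterwards -- the same bytes PALIGNR would produce.
std::array<uint8_t, 16> sse2ByteRotate(const std::array<uint8_t, 16> &Lo,
                                       const std::array<uint8_t, 16> &Hi,
                                       int R) {
  std::array<uint8_t, 16> LoShift{}, HiShift{}, Res{};
  for (int i = 0; i < 16; ++i) {
    if (i >= 16 - R)
      LoShift[i] = Lo[i - (16 - R)]; // PSLLDQ by (16 - R) bytes
    if (i + R < 16)
      HiShift[i] = Hi[i + R];        // PSRLDQ by R bytes
    Res[i] = LoShift[i] | HiShift[i];
  }
  return Res;
}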
11986
11987/// Try to lower a vector shuffle as a dword/qword rotation.
11988///
11989 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11990 /// rotation of the concatenation of two vectors; this routine will
11991 /// try to generically lower a vector shuffle through such a pattern.
11992///
11993/// Essentially it concatenates V1 and V2, shifts right by some number of
11994/// elements, and takes the low elements as the result. Note that while this is
11995/// specified as a *right shift* because x86 is little-endian, it is a *left
11996/// rotate* of the vector lanes.
11997static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11998 SDValue V2, ArrayRef<int> Mask,
11999 const X86Subtarget &Subtarget,
12000 SelectionDAG &DAG) {
12001 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12002 "Only 32-bit and 64-bit elements are supported!");
12003
12004 // 128/256-bit vectors are only supported with VLX.
12005 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12006 && "VLX required for 128/256-bit vectors");
12007
12008 SDValue Lo = V1, Hi = V2;
12009 int Rotation = matchShuffleAsByteRotate(Lo, Hi, Mask);
12010 if (Rotation <= 0)
12011 return SDValue();
12012
12013 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12014 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12015}
12016
12017/// Try to lower a vector shuffle as a byte shift sequence.
12018static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12019 SDValue V2, ArrayRef<int> Mask,
12020 const APInt &Zeroable,
12021 const X86Subtarget &Subtarget,
12022 SelectionDAG &DAG) {
12023 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12024 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12025
12026 // We need a shuffle that has zeros at one/both ends and a sequential
12027 // shuffle from one source within.
12028 unsigned ZeroLo = Zeroable.countTrailingOnes();
12029 unsigned ZeroHi = Zeroable.countLeadingOnes();
12030 if (!ZeroLo && !ZeroHi)
12031 return SDValue();
12032
12033 unsigned NumElts = Mask.size();
12034 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12035 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12036 return SDValue();
12037
12038 unsigned Scale = VT.getScalarSizeInBits() / 8;
12039 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12040 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12041 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12042 return SDValue();
12043
12044 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12045 Res = DAG.getBitcast(MVT::v16i8, Res);
12046
12047 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12048 // inner sequential set of elements, possibly offset:
12049 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12050 // 01234567 --> 4567zzzz --> zzzzz456
12051 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12052 if (ZeroLo == 0) {
12053 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12054 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12055 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12056 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12057 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12058 } else if (ZeroHi == 0) {
12059 unsigned Shift = Mask[ZeroLo] % NumElts;
12060 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12061 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12062 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12063 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12064 } else if (!Subtarget.hasSSSE3()) {
12065 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12066 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12067 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12068 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12069 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12070 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12071 Shift += Mask[ZeroLo] % NumElts;
12072 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12073 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12074 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12075 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12076 } else
12077 return SDValue();
12078
12079 return DAG.getBitcast(VT, Res);
12080}
12081
12082/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12083///
12084/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12085/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12086/// matches elements from one of the input vectors shuffled to the left or
12087/// right with zeroable elements 'shifted in'. It handles both the strictly
12088/// bit-wise element shifts and the byte shift across an entire 128-bit double
12089/// quad word lane.
12090///
12091 /// PSLL : (little-endian) left bit shift.
12092/// [ zz, 0, zz, 2 ]
12093/// [ -1, 4, zz, -1 ]
12094/// PSRL : (little-endian) right bit shift.
12095/// [ 1, zz, 3, zz]
12096/// [ -1, -1, 7, zz]
12097/// PSLLDQ : (little-endian) left byte shift
12098/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12099/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12100/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12101/// PSRLDQ : (little-endian) right byte shift
12102/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12103/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12104/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12105static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12106 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12107 int MaskOffset, const APInt &Zeroable,
12108 const X86Subtarget &Subtarget) {
12109 int Size = Mask.size();
12110 unsigned SizeInBits = Size * ScalarSizeInBits;
12111
12112 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12113 for (int i = 0; i < Size; i += Scale)
12114 for (int j = 0; j < Shift; ++j)
12115 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12116 return false;
12117
12118 return true;
12119 };
12120
12121 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12122 for (int i = 0; i != Size; i += Scale) {
12123 unsigned Pos = Left ? i + Shift : i;
12124 unsigned Low = Left ? i : i + Shift;
12125 unsigned Len = Scale - Shift;
12126 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12127 return -1;
12128 }
12129
12130 int ShiftEltBits = ScalarSizeInBits * Scale;
12131 bool ByteShift = ShiftEltBits > 64;
12132 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12133 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12134 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12135
12136 // Normalize the scale for byte shifts to still produce an i64 element
12137 // type.
12138 Scale = ByteShift ? Scale / 2 : Scale;
12139
12140 // We need to round trip through the appropriate type for the shift.
12141 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12142 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12143 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12144 return (int)ShiftAmt;
12145 };
12146
12147 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12148 // keep doubling the size of the integer elements up to that. We can
12149 // then shift the elements of the integer vector by whole multiples of
12150 // their width within the elements of the larger integer vector. Test each
12151 // multiple to see if we can find a match with the moved element indices
12152 // and that the shifted in elements are all zeroable.
12153 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12154 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12155 for (int Shift = 1; Shift != Scale; ++Shift)
12156 for (bool Left : {true, false})
12157 if (CheckZeros(Shift, Scale, Left)) {
12158 int ShiftAmt = MatchShift(Shift, Scale, Left);
12159 if (0 < ShiftAmt)
12160 return ShiftAmt;
12161 }
12162
12163 // no match
12164 return -1;
12165}
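For a single (Shift, Scale, Left) candidate, the two lambdas above reduce to: the shifted-in positions must be zeroable and the remaining positions must be sequential elements of the source. A hypothetical standalone version for the left-shift, single-input case (MaskOffset == 0):

#include <vector>

bool matchesLeftShift(const std::vector<int> &Mask,
                      const std::vector<bool> &Zeroable, int Shift,
                      int Scale) {
  const int Size = (int)Mask.size();
  for (int i = 0; i < Size; i += Scale) {
    for (int j = 0; j < Shift; ++j)        // shifted-in slots must be zero
      if (!Zeroable[i + j])
        return false;
    for (int j = Shift; j < Scale; ++j) {  // the rest must be sequential
      int M = Mask[i + j];
      if (M >= 0 && M != i + (j - Shift))
        return false;
    }
  }
  return true;
}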
12166
12167static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12168 SDValue V2, ArrayRef<int> Mask,
12169 const APInt &Zeroable,
12170 const X86Subtarget &Subtarget,
12171 SelectionDAG &DAG) {
12172 int Size = Mask.size();
12173 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12174
12175 MVT ShiftVT;
12176 SDValue V = V1;
12177 unsigned Opcode;
12178
12179 // Try to match shuffle against V1 shift.
12180 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12181 Mask, 0, Zeroable, Subtarget);
12182
12183 // If V1 failed, try to match shuffle against V2 shift.
12184 if (ShiftAmt < 0) {
12185 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12186 Mask, Size, Zeroable, Subtarget);
12187 V = V2;
12188 }
12189
12190 if (ShiftAmt < 0)
12191 return SDValue();
12192
12193 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12194 "Illegal integer vector type");
12195 V = DAG.getBitcast(ShiftVT, V);
12196 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12197 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12198 return DAG.getBitcast(VT, V);
12199}
12200
12201// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12202// Remainder of lower half result is zero and upper half is all undef.
12203static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12204 ArrayRef<int> Mask, uint64_t &BitLen,
12205 uint64_t &BitIdx, const APInt &Zeroable) {
12206 int Size = Mask.size();
12207 int HalfSize = Size / 2;
12208 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12209 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
12210
12211 // Upper half must be undefined.
12212 if (!isUndefUpperHalf(Mask))
12213 return false;
12214
12215 // Determine the extraction length from the part of the
12216 // lower half that isn't zeroable.
12217 int Len = HalfSize;
12218 for (; Len > 0; --Len)
12219 if (!Zeroable[Len - 1])
12220 break;
12221 assert(Len > 0 && "Zeroable shuffle mask");
12222
12223 // Attempt to match first Len sequential elements from the lower half.
12224 SDValue Src;
12225 int Idx = -1;
12226 for (int i = 0; i != Len; ++i) {
12227 int M = Mask[i];
12228 if (M == SM_SentinelUndef)
12229 continue;
12230 SDValue &V = (M < Size ? V1 : V2);
12231 M = M % Size;
12232
12233 // The extracted elements must start at a valid index and all mask
12234 // elements must be in the lower half.
12235 if (i > M || M >= HalfSize)
12236 return false;
12237
12238 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12239 Src = V;
12240 Idx = M - i;
12241 continue;
12242 }
12243 return false;
12244 }
12245
12246 if (!Src || Idx < 0)
12247 return false;
12248
12249 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12250 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12251 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12252 V1 = Src;
12253 return true;
12254}
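Once a run of Len elements starting at Idx has been matched, the EXTRQI operands are just those quantities converted to bits and masked to 6 bits, as above. A small illustrative helper (the names are hypothetical):

#include <cstdint>

struct ExtrqField { uint64_t BitLen, BitIdx; };

// Bit length and bit index of the extracted field, each taken modulo 64.
ExtrqField extrqImmediates(int NumElts, int StartElt, int EltSizeInBits) {
  return {uint64_t(NumElts * EltSizeInBits) & 0x3f,
          uint64_t(StartElt * EltSizeInBits) & 0x3f};
}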
12255
12256// INSERTQ: Extract lowest Len elements from lower half of second source and
12257// insert over first source, starting at Idx.
12258// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12259static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12260 ArrayRef<int> Mask, uint64_t &BitLen,
12261 uint64_t &BitIdx) {
12262 int Size = Mask.size();
12263 int HalfSize = Size / 2;
12264 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12265
12266 // Upper half must be undefined.
12267 if (!isUndefUpperHalf(Mask))
12268 return false;
12269
12270 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12271 SDValue Base;
12272
12273 // Attempt to match first source from mask before insertion point.
12274 if (isUndefInRange(Mask, 0, Idx)) {
12275 /* EMPTY */
12276 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12277 Base = V1;
12278 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12279 Base = V2;
12280 } else {
12281 continue;
12282 }
12283
12284 // Extend the extraction length looking to match both the insertion of
12285 // the second source and the remaining elements of the first.
12286 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12287 SDValue Insert;
12288 int Len = Hi - Idx;
12289
12290 // Match insertion.
12291 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12292 Insert = V1;
12293 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12294 Insert = V2;
12295 } else {
12296 continue;
12297 }
12298
12299 // Match the remaining elements of the lower half.
12300 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12301 /* EMPTY */
12302 } else if ((!Base || (Base == V1)) &&
12303 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12304 Base = V1;
12305 } else if ((!Base || (Base == V2)) &&
12306 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12307 Size + Hi)) {
12308 Base = V2;
12309 } else {
12310 continue;
12311 }
12312
12313 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12314 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12315 V1 = Base;
12316 V2 = Insert;
12317 return true;
12318 }
12319 }
12320
12321 return false;
12322}
12323
12324/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12325static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12326 SDValue V2, ArrayRef<int> Mask,
12327 const APInt &Zeroable, SelectionDAG &DAG) {
12328 uint64_t BitLen, BitIdx;
12329 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12330 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12331 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12332 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12333
12334 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12335 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12336 V2 ? V2 : DAG.getUNDEF(VT),
12337 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12338 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12339
12340 return SDValue();
12341}
12342
12343/// Lower a vector shuffle as a zero or any extension.
12344///
12345/// Given a specific number of elements, element bit width, and extension
12346/// stride, produce either a zero or any extension based on the available
12347/// features of the subtarget. The extended elements are consecutive and
12348 /// can start from an offset element index in the input; to
12349 /// avoid excess shuffling the offset must either be in the bottom lane
12350/// or at the start of a higher lane. All extended elements must be from
12351/// the same lane.
12352static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12353 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12354 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12355 assert(Scale > 1 && "Need a scale to extend.");
12356 int EltBits = VT.getScalarSizeInBits();
12357 int NumElements = VT.getVectorNumElements();
12358 int NumEltsPerLane = 128 / EltBits;
12359 int OffsetLane = Offset / NumEltsPerLane;
12360 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12361 "Only 8, 16, and 32 bit elements can be extended.");
12362 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12363 assert(0 <= Offset && "Extension offset must be positive.");
12364 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12365 "Extension offset must be in the first lane or start an upper lane.");
12366
12367 // Check that an index is in same lane as the base offset.
12368 auto SafeOffset = [&](int Idx) {
12369 return OffsetLane == (Idx / NumEltsPerLane);
12370 };
12371
12372 // Shift along an input so that the offset base moves to the first element.
12373 auto ShuffleOffset = [&](SDValue V) {
12374 if (!Offset)
12375 return V;
12376
12377 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12378 for (int i = 0; i * Scale < NumElements; ++i) {
12379 int SrcIdx = i + Offset;
12380 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12381 }
12382 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12383 };
12384
12385 // Found a valid a/zext mask! Try various lowering strategies based on the
12386 // input type and available ISA extensions.
12387 if (Subtarget.hasSSE41()) {
12388 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12389 // PUNPCK will catch this in a later shuffle match.
12390 if (Offset && Scale == 2 && VT.is128BitVector())
12391 return SDValue();
12392 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12393 NumElements / Scale);
12394 InputV = ShuffleOffset(InputV);
12395 InputV = getExtendInVec(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, DL,
12396 ExtVT, InputV, DAG);
12397 return DAG.getBitcast(VT, InputV);
12398 }
12399
12400 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12401
12402 // For any extends we can cheat for larger element sizes and use shuffle
12403 // instructions that can fold with a load and/or copy.
12404 if (AnyExt && EltBits == 32) {
12405 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12406 -1};
12407 return DAG.getBitcast(
12408 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12409 DAG.getBitcast(MVT::v4i32, InputV),
12410 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12411 }
12412 if (AnyExt && EltBits == 16 && Scale > 2) {
12413 int PSHUFDMask[4] = {Offset / 2, -1,
12414 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12415 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12416 DAG.getBitcast(MVT::v4i32, InputV),
12417 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12418 int PSHUFWMask[4] = {1, -1, -1, -1};
12419 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12420 return DAG.getBitcast(
12421 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12422 DAG.getBitcast(MVT::v8i16, InputV),
12423 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12424 }
12425
12426 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12427 // to 64-bits.
12428 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12429 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12430 assert(VT.is128BitVector() && "Unexpected vector width!");
12431
12432 int LoIdx = Offset * EltBits;
12433 SDValue Lo = DAG.getBitcast(
12434 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12435 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12436 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12437
12438 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12439 return DAG.getBitcast(VT, Lo);
12440
12441 int HiIdx = (Offset + 1) * EltBits;
12442 SDValue Hi = DAG.getBitcast(
12443 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12444 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12445 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12446 return DAG.getBitcast(VT,
12447 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12448 }
12449
12450 // If this would require more than 2 unpack instructions to expand, use
12451 // pshufb when available. We can only use more than 2 unpack instructions
12452 // when zero extending i8 elements which also makes it easier to use pshufb.
12453 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12454 assert(NumElements == 16 && "Unexpected byte vector width!");
12455 SDValue PSHUFBMask[16];
12456 for (int i = 0; i < 16; ++i) {
12457 int Idx = Offset + (i / Scale);
12458 if ((i % Scale == 0 && SafeOffset(Idx))) {
12459 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12460 continue;
12461 }
12462 PSHUFBMask[i] =
12463 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12464 }
12465 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12466 return DAG.getBitcast(
12467 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12468 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12469 }
12470
12471 // If we are extending from an offset, ensure we start on a boundary that
12472 // we can unpack from.
12473 int AlignToUnpack = Offset % (NumElements / Scale);
12474 if (AlignToUnpack) {
12475 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12476 for (int i = AlignToUnpack; i < NumElements; ++i)
12477 ShMask[i - AlignToUnpack] = i;
12478 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12479 Offset -= AlignToUnpack;
12480 }
12481
12482 // Otherwise emit a sequence of unpacks.
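// For example, a zero extension from i8 to i32 (Scale == 4) becomes PUNPCKLBW
// with a zero vector followed by PUNPCKLWD with a zero vector, halving Scale on
// each iteration until it reaches 1.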
12483 do {
12484 unsigned UnpackLoHi = X86ISD::UNPCKL;
12485 if (Offset >= (NumElements / 2)) {
12486 UnpackLoHi = X86ISD::UNPCKH;
12487 Offset -= (NumElements / 2);
12488 }
12489
12490 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12491 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12492 : getZeroVector(InputVT, Subtarget, DAG, DL);
12493 InputV = DAG.getBitcast(InputVT, InputV);
12494 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12495 Scale /= 2;
12496 EltBits *= 2;
12497 NumElements /= 2;
12498 } while (Scale > 1);
12499 return DAG.getBitcast(VT, InputV);
12500}
12501
12502/// Try to lower a vector shuffle as a zero extension on any microarch.
12503///
12504/// This routine will try to do everything in its power to cleverly lower
12505/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12506/// check for the profitability of this lowering, it tries to aggressively
12507/// match this pattern. It will use all of the micro-architectural details it
12508/// can to emit an efficient lowering. It handles both blends with all-zero
12509/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12510/// masking out later).
12511///
12512/// The reason we have dedicated lowering for zext-style shuffles is that they
12513/// are both incredibly common and often quite performance sensitive.
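/// For example, a v4i32 shuffle with mask <0, 4, 1, 5> where V2 is a zero
/// vector (so lanes 1 and 3 are zeroable) is matched as a zero extension of
/// the two low i32 elements of V1 into i64 lanes.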
12514static SDValue lowerShuffleAsZeroOrAnyExtend(
12515 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12516 const APInt &Zeroable, const X86Subtarget &Subtarget,
12517 SelectionDAG &DAG) {
12518 int Bits = VT.getSizeInBits();
12519 int NumLanes = Bits / 128;
12520 int NumElements = VT.getVectorNumElements();
12521 int NumEltsPerLane = NumElements / NumLanes;
12522 assert(VT.getScalarSizeInBits() <= 32 &&
12523 "Exceeds 32-bit integer zero extension limit");
12524 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12525
12526 // Define a helper function to check a particular ext-scale and lower to it if
12527 // valid.
12528 auto Lower = [&](int Scale) -> SDValue {
12529 SDValue InputV;
12530 bool AnyExt = true;
12531 int Offset = 0;
12532 int Matches = 0;
12533 for (int i = 0; i < NumElements; ++i) {
12534 int M = Mask[i];
12535 if (M < 0)
12536 continue; // Valid anywhere but doesn't tell us anything.
12537 if (i % Scale != 0) {
12538 // Each of the extended elements needs to be zeroable.
12539 if (!Zeroable[i])
12540 return SDValue();
12541
12542 // We no longer are in the anyext case.
12543 AnyExt = false;
12544 continue;
12545 }
12546
12547 // Each of the base elements needs to be consecutive indices into the
12548 // same input vector.
12549 SDValue V = M < NumElements ? V1 : V2;
12550 M = M % NumElements;
12551 if (!InputV) {
12552 InputV = V;
12553 Offset = M - (i / Scale);
12554 } else if (InputV != V)
12555 return SDValue(); // Flip-flopping inputs.
12556
12557 // Offset must start in the lowest 128-bit lane or at the start of an
12558 // upper lane.
12559 // FIXME: Is it ever worth allowing a negative base offset?
12560 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12561 (Offset % NumEltsPerLane) == 0))
12562 return SDValue();
12563
12564 // If we are offsetting, all referenced entries must come from the same
12565 // lane.
12566 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12567 return SDValue();
12568
12569 if ((M % NumElements) != (Offset + (i / Scale)))
12570 return SDValue(); // Non-consecutive strided elements.
12571 Matches++;
12572 }
12573
12574 // If we fail to find an input, we have a zero-shuffle which should always
12575 // have already been handled.
12576 // FIXME: Maybe handle this here in case during blending we end up with one?
12577 if (!InputV)
12578 return SDValue();
12579
12580 // If we are offsetting, don't extend if we only match a single input, we
12581 // can always do better by using a basic PSHUF or PUNPCK.
12582 if (Offset != 0 && Matches < 2)
12583 return SDValue();
12584
12585 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12586 InputV, Mask, Subtarget, DAG);
12587 };
12588
12589 // The widest scale possible for extending is to a 64-bit integer.
12590 assert(Bits % 64 == 0 &&
12591 "The number of bits in a vector must be divisible by 64 on x86!");
12592 int NumExtElements = Bits / 64;
12593
12594 // Each iteration, try extending the elements half as much, but into twice as
12595 // many elements.
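// For a 128-bit v16i8 shuffle, for instance, this tries Scale == 8 (i8 -> i64),
// then Scale == 4 (i8 -> i32), then Scale == 2 (i8 -> i16).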
12596 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12597 assert(NumElements % NumExtElements == 0 &&
12598 "The input vector size must be divisible by the extended size.");
12599 if (SDValue V = Lower(NumElements / NumExtElements))
12600 return V;
12601 }
12602
12603 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12604 if (Bits != 128)
12605 return SDValue();
12606
12607 // Returns one of the source operands if the shuffle can be reduced to a
12608 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
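// For example, a v4i32 shuffle that keeps elements 0 and 1 of V1 and whose
// upper two lanes are zeroable can be lowered as a single MOVQ of V1.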
12609 auto CanZExtLowHalf = [&]() {
12610 for (int i = NumElements / 2; i != NumElements; ++i)
12611 if (!Zeroable[i])
12612 return SDValue();
12613 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12614 return V1;
12615 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12616 return V2;
12617 return SDValue();
12618 };
12619
12620 if (SDValue V = CanZExtLowHalf()) {
12621 V = DAG.getBitcast(MVT::v2i64, V);
12622 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12623 return DAG.getBitcast(VT, V);
12624 }
12625
12626 // No viable ext lowering found.
12627 return SDValue();
12628}
12629
12630/// Try to get a scalar value for a specific element of a vector.
12631///
12632/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12633static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12634 SelectionDAG &DAG) {
12635 MVT VT = V.getSimpleValueType();
12636 MVT EltVT = VT.getVectorElementType();
12637 V = peekThroughBitcasts(V);
12638
12639 // If the bitcasts shift the element size, we can't extract an equivalent
12640 // element from it.
12641 MVT NewVT = V.getSimpleValueType();
12642 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12643 return SDValue();
12644
12645 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12646 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12647 // Ensure the scalar operand is the same size as the destination.
12648 // FIXME: Add support for scalar truncation where possible.
12649 SDValue S = V.getOperand(Idx);
12650 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12651 return DAG.getBitcast(EltVT, S);
12652 }
12653
12654 return SDValue();
12655}
12656
12657/// Helper to test for a load that can be folded with x86 shuffles.
12658///
12659/// This is particularly important because the set of instructions varies
12660/// significantly based on whether the operand is a load or not.
12661static bool isShuffleFoldableLoad(SDValue V) {
12662 V = peekThroughBitcasts(V);
12663 return ISD::isNON_EXTLoad(V.getNode());
12664}
12665
12666/// Try to lower insertion of a single element into a zero vector.
12667///
12668/// This is a common pattern that we have especially efficient patterns to lower
12669/// across all subtarget feature sets.
12670static SDValue lowerShuffleAsElementInsertion(
12671 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12672 const APInt &Zeroable, const X86Subtarget &Subtarget,
12673 SelectionDAG &DAG) {
12674 MVT ExtVT = VT;
12675 MVT EltVT = VT.getVectorElementType();
12676
12677 int V2Index =
12678 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12679 Mask.begin();
12680 bool IsV1Zeroable = true;
12681 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12682 if (i != V2Index && !Zeroable[i]) {
12683 IsV1Zeroable = false;
12684 break;
12685 }
12686
12687 // Check for a single input from a SCALAR_TO_VECTOR node.
12688 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12689 // all the smarts here sunk into that routine. However, the current
12690 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12691 // vector shuffle lowering is dead.
12692 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12693 DAG);
12694 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12695 // We need to zext the scalar if it is smaller than an i32.
12696 V2S = DAG.getBitcast(EltVT, V2S);
12697 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
12698 // Using zext to expand a narrow element won't work for non-zero
12699 // insertions.
12700 if (!IsV1Zeroable)
12701 return SDValue();
12702
12703 // Zero-extend directly to i32.
12704 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12705 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12706 }
12707 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12708 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12709 EltVT == MVT::i16) {
12710 // Either not inserting from the low element of the input or the input
12711 // element size is too small to use VZEXT_MOVL to clear the high bits.
12712 return SDValue();
12713 }
12714
12715 if (!IsV1Zeroable) {
12716 // If V1 can't be treated as a zero vector we have fewer options to lower
12717 // this. We can't support integer vectors or non-zero targets cheaply, and
12718 // the V1 elements can't be permuted in any way.
12719 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12720 if (!VT.isFloatingPoint() || V2Index != 0)
12721 return SDValue();
12722 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
12723 V1Mask[V2Index] = -1;
12724 if (!isNoopShuffleMask(V1Mask))
12725 return SDValue();
12726 if (!VT.is128BitVector())
12727 return SDValue();
12728
12729 // Otherwise, use MOVSD or MOVSS.
12730 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
12731 "Only two types of floating point element types to handle!");
12732 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
12733 ExtVT, V1, V2);
12734 }
12735
12736 // This lowering only works for the low element with floating point vectors.
12737 if (VT.isFloatingPoint() && V2Index != 0)
12738 return SDValue();
12739
12740 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12741 if (ExtVT != VT)
12742 V2 = DAG.getBitcast(VT, V2);
12743
12744 if (V2Index != 0) {
12745 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12746 // the desired position. Otherwise it is more efficient to do a vector
12747 // shift left. We know that we can do a vector shift left because all
12748 // the inputs are zero.
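// For example, inserting into lane 5 of a v8i16 result shifts the bitcast
// v16i8 value left by 5 * 16 / 8 == 10 bytes.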
12749 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
12750 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12751 V2Shuffle[V2Index] = 0;
12752 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12753 } else {
12754 V2 = DAG.getBitcast(MVT::v16i8, V2);
12755 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12756 DAG.getTargetConstant(
12757 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
12758 V2 = DAG.getBitcast(VT, V2);
12759 }
12760 }
12761 return V2;
12762}
12763
12764/// Try to lower broadcast of a single - truncated - integer element,
12765/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12766///
12767/// This assumes we have AVX2.
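/// For example, broadcasting byte 5 of a v4i32 build_vector takes operand 1
/// (Scale == 4), shifts it right by 8 bits, truncates to i8 and broadcasts the
/// result.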
12768static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12769 int BroadcastIdx,
12770 const X86Subtarget &Subtarget,
12771 SelectionDAG &DAG) {
12772 assert(Subtarget.hasAVX2() &&
12773 "We can only lower integer broadcasts with AVX2!");
12774
12775 MVT EltVT = VT.getVectorElementType();
12776 MVT V0VT = V0.getSimpleValueType();
12777
12778 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12779 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12780
12781 MVT V0EltVT = V0VT.getVectorElementType();
12782 if (!V0EltVT.isInteger())
12783 return SDValue();
12784
12785 const unsigned EltSize = EltVT.getSizeInBits();
12786 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12787
12788 // This is only a truncation if the original element type is larger.
12789 if (V0EltSize <= EltSize)
12790 return SDValue();
12791
12792 assert(((V0EltSize % EltSize) == 0) &&
12793 "Scalar type sizes must all be powers of 2 on x86!");
12794
12795 const unsigned V0Opc = V0.getOpcode();
12796 const unsigned Scale = V0EltSize / EltSize;
12797 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12798
12799 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12800 V0Opc != ISD::BUILD_VECTOR)
12801 return SDValue();
12802
12803 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12804
12805 // If we're extracting non-least-significant bits, shift so we can truncate.
12806 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12807 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12808 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12809 if (const int OffsetIdx = BroadcastIdx % Scale)
12810 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12811 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12812
12813 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12814 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12815}
12816
12817/// Test whether this can be lowered with a single SHUFPS instruction.
12818///
12819/// This is used to disable more specialized lowerings when the shufps lowering
12820/// will happen to be efficient.
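/// For example, <0, 1, 4, 5> can be done with one SHUFPS (low half from the
/// first input, high half from the second), while <0, 4, 1, 5> cannot because
/// each half mixes both inputs.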
12821static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12822 // This routine only handles 128-bit shufps.
12823 assert(Mask.size() == 4 && "Unsupported mask size!");
12824 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12825 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12826 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12827 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12828
12829 // To lower with a single SHUFPS we need to have the low half and high half
12830 // each requiring a single input.
12831 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12832 return false;
12833 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12834 return false;
12835
12836 return true;
12837}
12838
12839/// If we are extracting two 128-bit halves of a vector and shuffling the
12840/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12841/// multi-shuffle lowering.
12842static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12843 SDValue N1, ArrayRef<int> Mask,
12844 SelectionDAG &DAG) {
12845 MVT VT = N0.getSimpleValueType();
12846 assert((VT.is128BitVector() &&
12847 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12848 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12849
12850 // Check that both sources are extracts of the same source vector.
12851 if (!N0.hasOneUse() || !N1.hasOneUse() ||
12852 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12853 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12854 N0.getOperand(0) != N1.getOperand(0))
12855 return SDValue();
12856
12857 SDValue WideVec = N0.getOperand(0);
12858 MVT WideVT = WideVec.getSimpleValueType();
12859 if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
12860 !isa<ConstantSDNode>(N1.getOperand(1)))
12861 return SDValue();
12862
12863 // Match extracts of each half of the wide source vector. Commute the shuffle
12864 // if the extract of the low half is N1.
12865 unsigned NumElts = VT.getVectorNumElements();
12866 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
12867 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12868 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12869 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12870 ShuffleVectorSDNode::commuteMask(NewMask);
12871 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12872 return SDValue();
12873
12874 // Final bailout: if the mask is simple, we are better off using an extract
12875 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12876 // because that avoids a constant load from memory.
12877 if (NumElts == 4 &&
12878 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
12879 return SDValue();
12880
12881 // Extend the shuffle mask with undef elements.
12882 NewMask.append(NumElts, -1);
12883
12884 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12885 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12886 NewMask);
12887 // This is free: ymm -> xmm.
12888 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12889 DAG.getIntPtrConstant(0, DL));
12890}
12891
12892/// Try to lower broadcast of a single element.
12893///
12894/// For convenience, this code also bundles all of the subtarget feature set
12895/// filtering. While a little annoying to re-dispatch on type here, there isn't
12896/// a convenient way to factor it out.
12897static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12898 SDValue V2, ArrayRef<int> Mask,
12899 const X86Subtarget &Subtarget,
12900 SelectionDAG &DAG) {
12901 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12902 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
12903 (Subtarget.hasAVX2() && VT.isInteger())))
12904 return SDValue();
12905
12906 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12907 // we can only broadcast from a register with AVX2.
12908 unsigned NumEltBits = VT.getScalarSizeInBits();
12909 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12910 ? X86ISD::MOVDDUP
12911 : X86ISD::VBROADCAST;
12912 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12913
12914 // Check that the mask is a broadcast.
12915 int BroadcastIdx = getSplatIndex(Mask);
12916 if (BroadcastIdx < 0)
12917 return SDValue();
12918 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12919 "a sorted mask where the broadcast "
12920 "comes from V1.");
12921
12922 // Go up the chain of (vector) values to find a scalar load that we can
12923 // combine with the broadcast.
12924 // TODO: Combine this logic with findEltLoadSrc() used by
12925 // EltsFromConsecutiveLoads().
12926 int BitOffset = BroadcastIdx * NumEltBits;
12927 SDValue V = V1;
12928 for (;;) {
12929 switch (V.getOpcode()) {
12930 case ISD::BITCAST: {
12931 V = V.getOperand(0);
12932 continue;
12933 }
12934 case ISD::CONCAT_VECTORS: {
12935 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12936 int OpIdx = BitOffset / OpBitWidth;
12937 V = V.getOperand(OpIdx);
12938 BitOffset %= OpBitWidth;
12939 continue;
12940 }
12941 case ISD::EXTRACT_SUBVECTOR: {
12942 auto *ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
12943 if (!ConstantIdx)
12944 break;
12945
12946 // The extraction index adds to the existing offset.
12947 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12948 unsigned Idx = ConstantIdx->getZExtValue();
12949 unsigned BeginOffset = Idx * EltBitWidth;
12950 BitOffset += BeginOffset;
12951 V = V.getOperand(0);
12952 continue;
12953 }
12954 case ISD::INSERT_SUBVECTOR: {
12955 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12956 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
12957 if (!ConstantIdx)
12958 break;
12959
12960 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12961 int Idx = (int)ConstantIdx->getZExtValue();
12962 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12963 int BeginOffset = Idx * EltBitWidth;
12964 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12965 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12966 BitOffset -= BeginOffset;
12967 V = VInner;
12968 } else {
12969 V = VOuter;
12970 }
12971 continue;
12972 }
12973 }
12974 break;
12975 }
12976 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12977 BroadcastIdx = BitOffset / NumEltBits;
12978
12979 // Do we need to bitcast the source to retrieve the original broadcast index?
12980 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12981
12982 // Check if this is a broadcast of a scalar. We special case lowering
12983 // for scalars so that we can more effectively fold with loads.
12984 // If the original value has a larger element type than the shuffle, the
12985 // broadcast element is in essence truncated. Make that explicit to ease
12986 // folding.
12987 if (BitCastSrc && VT.isInteger())
12988 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12989 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12990 return TruncBroadcast;
12991
12992 // Also check the simpler case, where we can directly reuse the scalar.
12993 if (!BitCastSrc &&
12994 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12995 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12996 V = V.getOperand(BroadcastIdx);
12997
12998 // If we can't broadcast from a register, check that the input is a load.
12999 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13000 return SDValue();
13001 } else if (ISD::isNormalLoad(V.getNode()) &&
13002 cast<LoadSDNode>(V)->isSimple()) {
13003 // We do not check for one-use of the vector load because a broadcast load
13004 // is expected to be a win for code size, register pressure, and possibly
13005 // uops even if the original vector load is not eliminated.
13006
13007 // Reduce the vector load and shuffle to a broadcasted scalar load.
13008 LoadSDNode *Ld = cast<LoadSDNode>(V);
13009 SDValue BaseAddr = Ld->getOperand(1);
13010 MVT SVT = VT.getScalarType();
13011 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13012 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13013 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
13014
13015 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13016 // than MOVDDUP.
13017 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13018 if (Opcode == X86ISD::VBROADCAST) {
13019 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13020 SDValue Ops[] = {Ld->getChain(), NewAddr};
13021 V = DAG.getMemIntrinsicNode(
13022 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13023 DAG.getMachineFunction().getMachineMemOperand(
13024 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13025 DAG.makeEquivalentMemoryOrdering(Ld, V);
13026 return DAG.getBitcast(VT, V);
13027 }
13028 assert(SVT == MVT::f64 && "Unexpected VT!");
13029 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13030 DAG.getMachineFunction().getMachineMemOperand(
13031 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13032 DAG.makeEquivalentMemoryOrdering(Ld, V);
13033 } else if (!BroadcastFromReg) {
13034 // We can't broadcast from a vector register.
13035 return SDValue();
13036 } else if (BitOffset != 0) {
13037 // We can only broadcast from the zero-element of a vector register,
13038 // but it can be advantageous to broadcast from the zero-element of a
13039 // subvector.
13040 if (!VT.is256BitVector() && !VT.is512BitVector())
13041 return SDValue();
13042
13043 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13044 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13045 return SDValue();
13046
13047 // Only broadcast the zero-element of a 128-bit subvector.
13048 if ((BitOffset % 128) != 0)
13049 return SDValue();
13050
13051 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13052 "Unexpected bit-offset");
13053 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13054 "Unexpected vector size");
13055 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13056 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13057 }
13058
13059 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
13060 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
13061 DAG.getBitcast(MVT::f64, V));
13062
13063 // If this is a scalar, do the broadcast on this type and bitcast.
13064 if (!V.getValueType().isVector()) {
13065 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13066 "Unexpected scalar size");
13067 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13068 VT.getVectorNumElements());
13069 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13070 }
13071
13072 // We only support broadcasting from 128-bit vectors to minimize the
13073 // number of patterns we need to deal with in isel. So extract down to
13074 // 128-bits, removing as many bitcasts as possible.
13075 if (V.getValueSizeInBits() > 128)
13076 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
13077
13078 // Otherwise cast V to a vector with the same element type as VT, but
13079 // possibly narrower than VT. Then perform the broadcast.
13080 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13081 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13082 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13083}
13084
13085// Check for whether we can use INSERTPS to perform the shuffle. We only use
13086// INSERTPS when the V1 elements are already in the correct locations
13087// because otherwise we can just always use two SHUFPS instructions which
13088// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13089// perform INSERTPS if a single V1 element is out of place and all V2
13090// elements are zeroable.
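// For example, the v4f32 mask <0, 1, 6, 3> keeps three V1 elements in place and
// inserts element 2 of V2 into lane 2, giving an INSERTPS immediate of 0xA0.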
13091static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13092 unsigned &InsertPSMask,
13093 const APInt &Zeroable,
13094 ArrayRef<int> Mask, SelectionDAG &DAG) {
13095 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13096 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13097 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13098
13099 // Attempt to match INSERTPS with one element from VA or VB being
13100 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13101 // are updated.
13102 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13103 ArrayRef<int> CandidateMask) {
13104 unsigned ZMask = 0;
13105 int VADstIndex = -1;
13106 int VBDstIndex = -1;
13107 bool VAUsedInPlace = false;
13108
13109 for (int i = 0; i < 4; ++i) {
13110 // Synthesize a zero mask from the zeroable elements (includes undefs).
13111 if (Zeroable[i]) {
13112 ZMask |= 1 << i;
13113 continue;
13114 }
13115
13116 // Flag if we use any VA inputs in place.
13117 if (i == CandidateMask[i]) {
13118 VAUsedInPlace = true;
13119 continue;
13120 }
13121
13122 // We can only insert a single non-zeroable element.
13123 if (VADstIndex >= 0 || VBDstIndex >= 0)
13124 return false;
13125
13126 if (CandidateMask[i] < 4) {
13127 // VA input out of place for insertion.
13128 VADstIndex = i;
13129 } else {
13130 // VB input for insertion.
13131 VBDstIndex = i;
13132 }
13133 }
13134
13135 // Don't bother if we have no (non-zeroable) element for insertion.
13136 if (VADstIndex < 0 && VBDstIndex < 0)
13137 return false;
13138
13139 // Determine element insertion src/dst indices. The src index is from the
13140 // start of the inserted vector, not the start of the concatenated vector.
13141 unsigned VBSrcIndex = 0;
13142 if (VADstIndex >= 0) {
13143 // If we have a VA input out of place, we use VA as the V2 element
13144 // insertion and don't use the original V2 at all.
13145 VBSrcIndex = CandidateMask[VADstIndex];
13146 VBDstIndex = VADstIndex;
13147 VB = VA;
13148 } else {
13149 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13150 }
13151
13152 // If no V1 inputs are used in place, then the result is created only from
13153 // the zero mask and the V2 insertion - so remove V1 dependency.
13154 if (!VAUsedInPlace)
13155 VA = DAG.getUNDEF(MVT::v4f32);
13156
13157 // Update V1, V2 and InsertPSMask accordingly.
13158 V1 = VA;
13159 V2 = VB;
13160
13161 // Insert the V2 element into the desired position.
13162 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13163 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13164 return true;
13165 };
13166
13167 if (matchAsInsertPS(V1, V2, Mask))
13168 return true;
13169
13170 // Commute and try again.
13171 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
13172 ShuffleVectorSDNode::commuteMask(CommutedMask);
13173 if (matchAsInsertPS(V2, V1, CommutedMask))
13174 return true;
13175
13176 return false;
13177}
13178
13179static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13180 ArrayRef<int> Mask, const APInt &Zeroable,
13181 SelectionDAG &DAG) {
13182 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13183 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13184
13185 // Attempt to match the insertps pattern.
13186 unsigned InsertPSMask;
13187 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13188 return SDValue();
13189
13190 // Insert the V2 element into the desired position.
13191 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13192 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13193}
13194
13195/// Try to lower a shuffle as a permute of the inputs followed by an
13196/// UNPCK instruction.
13197///
13198/// This specifically targets cases where we end up with alternating between
13199/// the two inputs, and so can permute them into something that feeds a single
13200/// UNPCK instruction. Note that this routine only targets integer vectors
13201/// because for floating point vectors we have a generalized SHUFPS lowering
13202/// strategy that handles everything that doesn't *exactly* match an unpack,
13203/// making this clever lowering unnecessary.
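/// For example, the v4i32 mask <0, 4, 2, 6> is handled by permuting each input
/// with <0, 2, -1, -1> and then issuing a single UNPCKL.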
13204static SDValue lowerShuffleAsPermuteAndUnpack(
13205 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13206 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13207 assert(!VT.isFloatingPoint() &&
13208 "This routine only supports integer vectors.");
13209 assert(VT.is128BitVector() &&
13210 "This routine only works on 128-bit vectors.");
13211 assert(!V2.isUndef() &&
13212 "This routine should only be used when blending two inputs.");
13213 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13214
13215 int Size = Mask.size();
13216
13217 int NumLoInputs =
13218 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13219 int NumHiInputs =
13220 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13221
13222 bool UnpackLo = NumLoInputs >= NumHiInputs;
13223
13224 auto TryUnpack = [&](int ScalarSize, int Scale) {
13225 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13226 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13227
13228 for (int i = 0; i < Size; ++i) {
13229 if (Mask[i] < 0)
13230 continue;
13231
13232 // Each element of the unpack contains Scale elements from this mask.
13233 int UnpackIdx = i / Scale;
13234
13235 // We only handle the case where V1 feeds the first slots of the unpack.
13236 // We rely on canonicalization to ensure this is the case.
13237 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13238 return SDValue();
13239
13240 // Setup the mask for this input. The indexing is tricky as we have to
13241 // handle the unpack stride.
13242 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13243 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13244 Mask[i] % Size;
13245 }
13246
13247 // If we will have to shuffle both inputs to use the unpack, check whether
13248 // we can just unpack first and shuffle the result. If so, skip this unpack.
13249 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13250 !isNoopShuffleMask(V2Mask))
13251 return SDValue();
13252
13253 // Shuffle the inputs into place.
13254 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13255 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13256
13257 // Cast the inputs to the type we will use to unpack them.
13258 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13259 V1 = DAG.getBitcast(UnpackVT, V1);
13260 V2 = DAG.getBitcast(UnpackVT, V2);
13261
13262 // Unpack the inputs and cast the result back to the desired type.
13263 return DAG.getBitcast(
13264 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13265 UnpackVT, V1, V2));
13266 };
13267
13268 // We try each unpack from the largest to the smallest to try and find one
13269 // that fits this mask.
13270 int OrigScalarSize = VT.getScalarSizeInBits();
13271 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13272 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13273 return Unpack;
13274
13275 // If we're shuffling with a zero vector then we're better off not doing
13276 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13277 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13278 ISD::isBuildVectorAllZeros(V2.getNode()))
13279 return SDValue();
13280
13281 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13282 // initial unpack.
13283 if (NumLoInputs == 0 || NumHiInputs == 0) {
13284 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13285 "We have to have *some* inputs!");
13286 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13287
13288 // FIXME: We could consider the total complexity of the permute of each
13289 // possible unpacking. Or at the least we should consider how many
13290 // half-crossings are created.
13291 // FIXME: We could consider commuting the unpacks.
13292
13293 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13294 for (int i = 0; i < Size; ++i) {
13295 if (Mask[i] < 0)
13296 continue;
13297
13298 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13299
13300 PermMask[i] =
13301 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13302 }
13303 return DAG.getVectorShuffle(
13304 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
13305 DL, VT, V1, V2),
13306 DAG.getUNDEF(VT), PermMask);
13307 }
13308
13309 return SDValue();
13310}
13311
13312/// Handle lowering of 2-lane 64-bit floating point shuffles.
13313///
13314/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13315/// support for floating point shuffles but not integer shuffles. These
13316/// instructions will incur a domain crossing penalty on some chips though so
13317/// it is better to avoid lowering through this for integer vectors where
13318/// possible.
13319static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13320 const APInt &Zeroable, SDValue V1, SDValue V2,
13321 const X86Subtarget &Subtarget,
13322 SelectionDAG &DAG) {
13323 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13324 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13325 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13326
13327 if (V2.isUndef()) {
13328 // Check for being able to broadcast a single element.
13329 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13330 Mask, Subtarget, DAG))
13331 return Broadcast;
13332
13333 // Straight shuffle of a single input vector. Simulate this by using the
13334 // single input as both of the "inputs" to this instruction..
13335 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13336
13337 if (Subtarget.hasAVX()) {
13338 // If we have AVX, we can use VPERMILPS which will allow folding a load
13339 // into the shuffle.
13340 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13341 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13342 }
13343
13344 return DAG.getNode(
13345 X86ISD::SHUFP, DL, MVT::v2f64,
13346 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13347 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13348 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13349 }
13350 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13351 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13352 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13353 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13354
13355 if (Subtarget.hasAVX2())
13356 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13357 return Extract;
13358
13359 // When loading a scalar and then shuffling it into a vector we can often do
13360 // the insertion cheaply.
13361 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13362 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13363 return Insertion;
13364 // Try inverting the insertion since for v2 masks it is easy to do and we
13365 // can't reliably sort the mask one way or the other.
13366 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13367 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
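// [Editor's note, not part of the original source] The XOR with 2 flips which
// input each lane reads from: e.g. Mask = {2, 1} (lane 0 from V2, lane 1 from
// V1) becomes InverseMask = {0, 3}, which describes the same shuffle once V2
// and V1 are passed to the insertion helper in swapped order. Negative
// (undef) entries are preserved as -1.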
13368 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13369 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13370 return Insertion;
13371
13372 // Try to use one of the special instruction patterns to handle two common
13373 // blend patterns if a zero-blend above didn't work.
13374 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
13375 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
13376 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13377 // We can either use a special instruction to load over the low double or
13378 // to move just the low double.
13379 return DAG.getNode(
13380 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13381 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13382
13383 if (Subtarget.hasSSE41())
13384 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13385 Zeroable, Subtarget, DAG))
13386 return Blend;
13387
13388 // Use dedicated unpack instructions for masks that match their pattern.
13389 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
13390 return V;
13391
13392 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
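// [Editor's note, not part of the original source] In this two-input form,
// bit 0 is set when lane 0 takes the high element of V1 and bit 1 when lane 1
// takes the high element of V2 (Mask[1] - 2 maps the V2 indices 2/3 back to
// 0/1). For example, Mask = {1, 2} encodes as 0b01: dst = { V1[1], V2[0] }.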
13393 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13394 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13395}
13396
13397/// Handle lowering of 2-lane 64-bit integer shuffles.
13398///
13399/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13400/// the integer unit to minimize domain crossing penalties. However, for blends
13401/// it falls back to the floating point shuffle operation with appropriate bit
13402/// casting.
13403static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13404 const APInt &Zeroable, SDValue V1, SDValue V2,
13405 const X86Subtarget &Subtarget,
13406 SelectionDAG &DAG) {
13407 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13408 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13409 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13410
13411 if (V2.isUndef()) {
13412 // Check for being able to broadcast a single element.
13413 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13414 Mask, Subtarget, DAG))
13415 return Broadcast;
13416
13417 // Straight shuffle of a single input vector. For everything from SSE2
13418 // onward this has a single fast instruction with no scary immediates.
13419 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13420 V1 = DAG.getBitcast(MVT::v4i32, V1);
13421 int WidenedMask[4] = {
13422 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
13423 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
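// [Editor's note, not part of the original source] Each 64-bit lane index k
// expands to the 32-bit pair {2k, 2k+1}; std::max treats an undef lane (-1)
// as 0. E.g. Mask = {1, 0} widens to {2, 3, 0, 1}, and the PSHUFD below
// swaps the two halves of the vector.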
13424 return DAG.getBitcast(
13425 MVT::v2i64,
13426 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13427 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13428 }
13429 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13430 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13431 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13432 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13433
13434 if (Subtarget.hasAVX2())
13435 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13436 return Extract;
13437
13438 // Try to use shift instructions.
13439 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
13440 Zeroable, Subtarget, DAG))
13441 return Shift;
13442
13443 // When loading a scalar and then shuffling it into a vector we can often do
13444 // the insertion cheaply.
13445 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13446 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13447 return Insertion;
13448 // Try inverting the insertion since for v2 masks it is easy to do and we
13449 // can't reliably sort the mask one way or the other.
13450 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13451 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13452 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13453 return Insertion;
13454
13455 // We have different paths for blend lowering, but they all must use the
13456 // *exact* same predicate.
13457 bool IsBlendSupported = Subtarget.hasSSE41();
13458 if (IsBlendSupported)
13459 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13460 Zeroable, Subtarget, DAG))
13461 return Blend;
13462
13463 // Use dedicated unpack instructions for masks that match their pattern.
13464 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
13465 return V;
13466
13467 // Try to use byte rotation instructions.
13468 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13469 if (Subtarget.hasSSSE3()) {
13470 if (Subtarget.hasVLX())
13471 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13472 Subtarget, DAG))
13473 return Rotate;
13474
13475 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13476 Subtarget, DAG))
13477 return Rotate;
13478 }
13479
13480 // If we have direct support for blends, we should lower by decomposing into
13481 // a permute. That will be faster than the domain cross.
13482 if (IsBlendSupported)
13483 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask,
13484 Subtarget, DAG);
13485
13486 // We implement this with SHUFPD which is pretty lame because it will likely
13487 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13488 // However, all the alternatives are still more cycles and newer chips don't
13489 // have this problem. It would be really nice if x86 had better shuffles here.
13490 V1 = DAG.getBitcast(MVT::v2f64, V1);
13491 V2 = DAG.getBitcast(MVT::v2f64, V2);
13492 return DAG.getBitcast(MVT::v2i64,
13493 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13494}
13495
13496/// Lower a vector shuffle using the SHUFPS instruction.
13497///
13498/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13499/// It makes no assumptions about whether this is the *best* lowering, it simply
13500/// uses it.
13501static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13502 ArrayRef<int> Mask, SDValue V1,
13503 SDValue V2, SelectionDAG &DAG) {
13504 SDValue LowV = V1, HighV = V2;
13505 SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
13506 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13507
13508 if (NumV2Elements == 1) {
13509 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13510
13511 // Compute the index adjacent to V2Index and in the same half by toggling
13512 // the low bit.
13513 int V2AdjIndex = V2Index ^ 1;
13514
13515 if (Mask[V2AdjIndex] < 0) {
13516 // Handles all the cases where we have a single V2 element and an undef.
13517 // This will only ever happen in the high lanes because we commute the
13518 // vector otherwise.
13519 if (V2Index < 2)
13520 std::swap(LowV, HighV);
13521 NewMask[V2Index] -= 4;
13522 } else {
13523 // Handle the case where the V2 element ends up adjacent to a V1 element.
13524 // To make this work, blend them together as the first step.
13525 int V1Index = V2AdjIndex;
13526 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13527 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13528 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13529
13530 // Now proceed to reconstruct the final blend as we have the necessary
13531 // high or low half formed.
13532 if (V2Index < 2) {
13533 LowV = V2;
13534 HighV = V1;
13535 } else {
13536 HighV = V2;
13537 }
13538 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13539 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13540 }
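// [Editor's note, not part of the original source] Worked example of the
// adjacent-element case above, assuming Mask = {0, 5, 2, 3}: V2Index = 1,
// V1Index = 0, BlendMask = {1, 0, 0, 0}, so the first SHUFP forms
// V2' = { V2[1], V2[0], V1[0], V1[0] }. With LowV = V2', HighV = V1 and
// NewMask = {2, 0, 2, 3}, the final SHUFP below produces
// { V1[0], V2[1], V1[2], V1[3] }, matching the requested mask.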
13541 } else if (NumV2Elements == 2) {
13542 if (Mask[0] < 4 && Mask[1] < 4) {
13543 // Handle the easy case where we have V1 in the low lanes and V2 in the
13544 // high lanes.
13545 NewMask[2] -= 4;
13546 NewMask[3] -= 4;
13547 } else if (Mask[2] < 4 && Mask[3] < 4) {
13548 // We also handle the reversed case because this utility may get called
13549 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13550 // arrange things in the right direction.
13551 NewMask[0] -= 4;
13552 NewMask[1] -= 4;
13553 HighV = V1;
13554 LowV = V2;
13555 } else {
13556 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13557 // trying to place elements directly, just blend them and set up the final
13558 // shuffle to place them.
13559
13560 // The first two blend mask elements are for V1, the second two are for
13561 // V2.
13562 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13563 Mask[2] < 4 ? Mask[2] : Mask[3],
13564 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13565 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13566 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13567 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13568
13569 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13570 // a blend.
13571 LowV = HighV = V1;
13572 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13573 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13574 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13575 NewMask[3] = Mask[2] < 4 ? 3 : 1;
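// [Editor's note, not part of the original source] Worked example, assuming
// Mask = {0, 4, 5, 1}: BlendMask = {0, 1, 0, 1} gives
// V1' = { V1[0], V1[1], V2[0], V2[1] }, and NewMask = {0, 2, 3, 1} then
// selects { V1[0], V2[0], V2[1], V1[1] } from V1' alone, which is the
// requested shuffle.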
13576 }
13577 }
13578 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13579 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13580}
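// [Editor's sketch, not part of X86ISelLowering.cpp] The
// getV4X86ShuffleImm8ForMask helper used throughout these lowerings packs a
// 4-element mask into the 8-bit immediate consumed by SHUFPS/PSHUFD/
// PSHUFLW/PSHUFHW: two bits per destination lane, lane 0 in the low bits. A
// minimal stand-alone illustration of that encoding follows; the function
// name is hypothetical and undef lanes are assumed to default to 0.
static unsigned encodeV4ShuffleImm8Sketch(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (Mask[i] < 0 ? 0u : unsigned(Mask[i] & 0x3)) << (2 * i);
  return Imm;
}
// e.g. the reversal mask {3, 2, 1, 0} encodes as 0x1B (0b00011011).
// [End of editor's sketch]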
13581
13582/// Lower 4-lane 32-bit floating point shuffles.
13583///
13584/// Uses instructions exclusively from the floating point unit to minimize
13585/// domain crossing penalties, as these are sufficient to implement all v4f32
13586/// shuffles.
13587static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13588 const APInt &Zeroable, SDValue V1, SDValue V2,
13589 const X86Subtarget &Subtarget,
13590 SelectionDAG &DAG) {
13591 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13592 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13593 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13594
13595 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13596
13597 if (NumV2Elements == 0) {
13598 // Check for being able to broadcast a single element.
13599 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13600 Mask, Subtarget, DAG))
13601 return Broadcast;
13602
13603 // Use even/odd duplicate instructions for masks that match their pattern.
13604 if (Subtarget.hasSSE3()) {
13605 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13606 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13607 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
13608 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13609 }
13610
13611 if (Subtarget.hasAVX()) {
13612 // If we have AVX, we can use VPERMILPS which will allow folding a load
13613 // into the shuffle.
13614 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13615 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13616 }
13617
13618 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13619 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13620 if (!Subtarget.hasSSE2()) {
13621 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
13622 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13623 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
13624 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13625 }
13626
13627 // Otherwise, use a straight shuffle of a single input vector. We pass the
13628 // input vector to both operands to simulate this with a SHUFPS.
13629 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13630 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13631 }
13632
13633 if (Subtarget.hasAVX2())
13634 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13635 return Extract;
13636
13637 // There are special ways we can lower some single-element blends. However, we
13638 // have custom ways we can lower more complex single-element blends below that
13639 // we defer to if both this and BLENDPS fail to match, so restrict this to
13640 // when the V2 input is targeting element 0 of the mask -- that is the fast
13641 // case here.
13642 if (NumV2Elements == 1 && Mask[0] >= 4)
13643 if (SDValue V = lowerShuffleAsElementInsertion(
13644 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13645 return V;
13646
13647 if (Subtarget.hasSSE41()) {
13648 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13649 Zeroable, Subtarget, DAG))
13650 return Blend;
13651
13652 // Use INSERTPS if we can complete the shuffle efficiently.
13653 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13654 return V;
13655
13656 if (!isSingleSHUFPSMask(Mask))
13657 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13658 V2, Mask, DAG))
13659 return BlendPerm;
13660 }
13661
13662 // Use low/high mov instructions. These are only valid in SSE1 because
13663 // otherwise they are widened to v2f64 and never get here.
13664 if (!Subtarget.hasSSE2()) {
13665 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
13666 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13667 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
13668 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13669 }
13670
13671 // Use dedicated unpack instructions for masks that match their pattern.
13672 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13673 return V;
13674
13675 // Otherwise fall back to a SHUFPS lowering strategy.
13676 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13677}
13678
13679/// Lower 4-lane i32 vector shuffles.
13680///
13681/// We try to handle these with integer-domain shuffles where we can, but for
13682/// blends we use the floating point domain blend instructions.
13683static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13684 const APInt &Zeroable, SDValue V1, SDValue V2,
13685 const X86Subtarget &Subtarget,
13686 SelectionDAG &DAG) {
13687 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13688 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13689 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13690
13691 // Whenever we can lower this as a zext, that instruction is strictly faster
13692 // than any alternative. It also allows us to fold memory operands into the
13693 // shuffle in many cases.
13694 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13695 Zeroable, Subtarget, DAG))
13696 return ZExt;
13697
13698 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13699
13700 if (NumV2Elements == 0) {
13701 // Try to use broadcast unless the mask only has one non-undef element.
13702 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13703 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13704 Mask, Subtarget, DAG))
13705 return Broadcast;
13706 }
13707
13708 // Straight shuffle of a single input vector. For everything from SSE2
13709 // onward this has a single fast instruction with no scary immediates.
13710 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13711 // but we aren't actually going to use the UNPCK instruction because doing
13712 // so prevents folding a load into this instruction or making a copy.
13713 const int UnpackLoMask[] = {0, 0, 1, 1};
13714 const int UnpackHiMask[] = {2, 2, 3, 3};
13715 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
13716 Mask = UnpackLoMask;
13717 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
13718 Mask = UnpackHiMask;
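// [Editor's note, not part of the original source] Example: a requested mask
// of {2, 2, 3, 3} stays a PSHUFD (immediate 0xFA) instead of becoming an
// UNPCKHDQ, so a load of V1 can still be folded into the instruction, as the
// comment above explains.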
13719
13720 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13721 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13722 }
13723
13724 if (Subtarget.hasAVX2())
13725 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13726 return Extract;
13727
13728 // Try to use shift instructions.
13729 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
13730 Zeroable, Subtarget, DAG))
13731 return Shift;
13732
13733 // There are special ways we can lower some single-element blends.
13734 if (NumV2Elements == 1)
13735 if (SDValue V = lowerShuffleAsElementInsertion(
13736 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13737 return V;
13738
13739 // We have different paths for blend lowering, but they all must use the
13740 // *exact* same predicate.
13741 bool IsBlendSupported = Subtarget.hasSSE41();
13742 if (IsBlendSupported)
13743 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13744 Zeroable, Subtarget, DAG))
13745 return Blend;
13746
13747 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13748 Zeroable, Subtarget, DAG))
13749 return Masked;
13750
13751 // Use dedicated unpack instructions for masks that match their pattern.
13752 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13753 return V;
13754
13755 // Try to use byte rotation instructions.
13756 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13757 if (Subtarget.hasSSSE3()) {
13758 if (Subtarget.hasVLX())
13759 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13760 Subtarget, DAG))
13761 return Rotate;
13762
13763 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13764 Subtarget, DAG))
13765 return Rotate;
13766 }
13767
13768 // Assume that a single SHUFPS is faster than an alternative sequence of
13769 // multiple instructions (even if the CPU has a domain penalty).
13770 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13771 if (!isSingleSHUFPSMask(Mask)) {
13772 // If we have direct support for blends, we should lower by decomposing into
13773 // a permute. That will be faster than the domain cross.
13774 if (IsBlendSupported)
13775 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask,
13776 Subtarget, DAG);
13777
13778 // Try to lower by permuting the inputs into an unpack instruction.
13779 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13780 Mask, Subtarget, DAG))
13781 return Unpack;
13782 }
13783
13784 // We implement this with SHUFPS because it can blend from two vectors.
13785 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13786 // up the inputs, bypassing domain shift penalties that we would incur if we
13787 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13788 // relevant.
13789 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13790 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13791 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13792 return DAG.getBitcast(MVT::v4i32, ShufPS);
13793}
13794
13795/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13796/// shuffle lowering, and the most complex part.
13797///
13798/// The lowering strategy is to try to form pairs of input lanes which are
13799/// targeted at the same half of the final vector, and then use a dword shuffle
13800/// to place them onto the right half, and finally unpack the paired lanes into
13801/// their final position.
13802///
13803/// The exact breakdown of how to form these dword pairs and align them on the
13804/// correct sides is really tricky. See the comments within the function for
13805/// more of the details.
13806///
13807/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13808/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13809/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13810/// vector, form the analogous 128-bit 8-element Mask.
13811static SDValue lowerV8I16GeneralSingleInputShuffle(
13812 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13813 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13814 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13815 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13816
13817 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13818 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13819 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13820
13821 // Attempt to directly match PSHUFLW or PSHUFHW.
13822 if (isUndefOrInRange(LoMask, 0, 4) &&
13823 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13824 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13825 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13826 }
13827 if (isUndefOrInRange(HiMask, 4, 8) &&
13828 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13829 for (int i = 0; i != 4; ++i)
13830 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13831 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13832 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13833 }
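// [Editor's note, not part of the original source] Example of the direct
// matches above: Mask = {2, 0, 1, 3, 4, 5, 6, 7} hits the first branch and
// becomes a single PSHUFLW with the immediate built from {2, 0, 1, 3}, while
// Mask = {0, 1, 2, 3, 6, 4, 5, 7} hits the second branch, where the high
// half is rebased to {2, 0, 1, 3} and lowered as a single PSHUFHW.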
13834
13835 SmallVector<int, 4> LoInputs;
13836 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13837 array_pod_sort(LoInputs.begin(), LoInputs.end());
13838 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13839 SmallVector<int, 4> HiInputs;
13840 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13841 array_pod_sort(HiInputs.begin(), HiInputs.end());
13842 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13843 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13844 int NumHToL = LoInputs.size() - NumLToL;
13845 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13846 int NumHToH = HiInputs.size() - NumLToH;
13847 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13848 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13849 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13850 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13851
13852 // If we are shuffling values from one half - check how many different DWORD
13853 // pairs we need to create. If only 1 or 2 then we can perform this as a
13854 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13855 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13856 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13857 V = DAG.getNode(ShufWOp, DL, VT, V,
13858 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13859 V = DAG.getBitcast(PSHUFDVT, V);
13860 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13861 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13862 return DAG.getBitcast(VT, V);
13863 };
13864
13865 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13866 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13867 SmallVector<std::pair<int, int>, 4> DWordPairs;
13868 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13869
13870 // Collect the different DWORD pairs.
13871 for (int DWord = 0; DWord != 4; ++DWord) {
13872 int M0 = Mask[2 * DWord + 0];
13873 int M1 = Mask[2 * DWord + 1];
13874 M0 = (M0 >= 0 ? M0 % 4 : M0);
13875 M1 = (M1 >= 0 ? M1 % 4 : M1);
13876 if (M0 < 0 && M1 < 0)
13877 continue;
13878
13879 bool Match = false;
13880 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13881 auto &DWordPair = DWordPairs[j];
13882 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13883 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13884 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13885 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13886 PSHUFDMask[DWord] = DOffset + j;
13887 Match = true;
13888 break;
13889 }
13890 }
13891 if (!Match) {
13892 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13893 DWordPairs.push_back(std::make_pair(M0, M1));
13894 }
13895 }
13896
13897 if (DWordPairs.size() <= 2) {
13898 DWordPairs.resize(2, std::make_pair(-1, -1));
13899 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13900 DWordPairs[1].first, DWordPairs[1].second};
13901 if ((NumHToL + NumHToH) == 0)
13902 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13903 if ((NumLToL + NumLToH) == 0)
13904 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13905 }
13906 }
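// [Editor's note, not part of the original source] Example of the DWORD-pair
// path above, assuming Mask = {0, 1, 0, 1, 2, 3, 2, 3} (all inputs from the
// low half): only two distinct pairs (0,1) and (2,3) are collected, so
// PSHUFHalfMask = {0, 1, 2, 3} (the PSHUFLW is a no-op) and
// PSHUFDMask = {0, 0, 1, 1} replicates the two dwords into place.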
13907
13908 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13909 // such inputs we can swap two of the dwords across the half mark and end up
13910 // with <=2 inputs to each half in each half. Once there, we can fall through
13911 // to the generic code below. For example:
13912 //
13913 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13914 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13915 //
13916 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13917 // and an existing 2-into-2 on the other half. In this case we may have to
13918 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13919 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13920 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13921 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13922 // half than the one we target for fixing) will be fixed when we re-enter this
13923 // path. We will also combine away any sequence of PSHUFD instructions that
13924 // result into a single instruction. Here is an example of the tricky case:
13925 //
13926 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13927 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13928 //
13929 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13930 //
13931 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13932 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13933 //
13934 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13935 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13936 //
13937 // The result is fine to be handled by the generic logic.
13938 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13939 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13940 int AOffset, int BOffset) {
13941 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13942        "Must call this with A having 3 or 1 inputs from the A half.");
13943 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13944        "Must call this with B having 1 or 3 inputs from the B half.");
13945 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13946        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13947
13948 bool ThreeAInputs = AToAInputs.size() == 3;
13949
13950 // Compute the index of dword with only one word among the three inputs in
13951 // a half by taking the sum of the half with three inputs and subtracting
13952 // the sum of the actual three inputs. The difference is the remaining
13953 // slot.
13954 int ADWord = 0, BDWord = 0;
13955 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13956 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13957 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13958 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13959 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13960 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13961 int TripleNonInputIdx =
13962 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13963 TripleDWord = TripleNonInputIdx / 2;
13964
13965 // We use xor with one to compute the adjacent DWord to whichever one the
13966 // OneInput is in.
13967 OneInputDWord = (OneInput / 2) ^ 1;
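// [Editor's note, not part of the original source] Example of the two index
// computations above: with AToAInputs = {0, 1, 3} and AOffset = 0, the sum
// trick gives TripleNonInputIdx = 6 - (0 + 1 + 3) = 2 and TripleDWord = 1;
// with OneInput = 6, OneInputDWord = (6 / 2) ^ 1 = 2, so the PSHUFD emitted
// at the end of this lambda swaps dwords 1 and 2.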
13968
13969 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13970 // and BToA inputs. If there is also such a problem with the BToB and AToB
13971 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13972 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13973 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13974 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13975 // Compute how many inputs will be flipped by swapping these DWords. We
13976 // need
13977 // to balance this to ensure we don't form a 3-1 shuffle in the other
13978 // half.
13979 int NumFlippedAToBInputs =
13980 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
13981 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
13982 int NumFlippedBToBInputs =
13983 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
13984 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
13985 if ((NumFlippedAToBInputs == 1 &&
13986 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13987 (NumFlippedBToBInputs == 1 &&
13988 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13989 // We choose whether to fix the A half or B half based on whether that
13990 // half has zero flipped inputs. At zero, we may not be able to fix it
13991 // with that half. We also bias towards fixing the B half because that
13992 // will more commonly be the high half, and we have to bias one way.
13993 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13994 ArrayRef<int> Inputs) {
13995 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13996 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13997 // Determine whether the free index is in the flipped dword or the
13998 // unflipped dword based on where the pinned index is. We use this bit
13999 // in an xor to conditionally select the adjacent dword.
14000 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14001 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14002 if (IsFixIdxInput == IsFixFreeIdxInput)
14003 FixFreeIdx += 1;
14004 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14005 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14006        "We need to be changing the number of flipped inputs!");
14007 int PSHUFHalfMask[] = {0, 1, 2, 3};
14008 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14009 V = DAG.getNode(
14010 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14011 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14012 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14013
14014 for (int &M : Mask)
14015 if (M >= 0 && M == FixIdx)
14016 M = FixFreeIdx;
14017 else if (M >= 0 && M == FixFreeIdx)
14018 M = FixIdx;
14019 };
14020 if (NumFlippedBToBInputs != 0) {
14021 int BPinnedIdx =
14022 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14023 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14024 } else {
14025 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14026 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14027 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14028 }
14029 }
14030 }
14031
14032 int PSHUFDMask[] = {0, 1, 2, 3};
14033 PSHUFDMask[ADWord] = BDWord;
14034 PSHUFDMask[BDWord] = ADWord;
14035 V = DAG.getBitcast(
14036 VT,
14037 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14038 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14039
14040 // Adjust the mask to match the new locations of A and B.
14041 for (int &M : Mask)
14042 if (M >= 0 && M/2 == ADWord)
14043 M = 2 * BDWord + M % 2;
14044 else if (M >= 0 && M/2 == BDWord)
14045 M = 2 * ADWord + M % 2;
14046
14047 // Recurse back into this routine to re-compute state now that this isn't
14048 // a 3 and 1 problem.
14049 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14050 };
14051 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14052 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14053 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14054 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14055
14056 // At this point there are at most two inputs to the low and high halves from
14057 // each half. That means the inputs can always be grouped into dwords and
14058 // those dwords can then be moved to the correct half with a dword shuffle.
14059 // We use at most one low and one high word shuffle to collect these paired
14060 // inputs into dwords, and finally a dword shuffle to place them.
14061 int PSHUFLMask[4] = {-1, -1, -1, -1};
14062 int PSHUFHMask[4] = {-1, -1, -1, -1};
14063 int PSHUFDMask[4] = {-1, -1, -1, -1};
14064
14065 // First fix the masks for all the inputs that are staying in their
14066 // original halves. This will then dictate the targets of the cross-half
14067 // shuffles.
14068 auto fixInPlaceInputs =
14069 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14070 MutableArrayRef<int> SourceHalfMask,
14071 MutableArrayRef<int> HalfMask, int HalfOffset) {
14072 if (InPlaceInputs.empty())
14073 return;
14074 if (InPlaceInputs.size() == 1) {
14075 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14076 InPlaceInputs[0] - HalfOffset;
14077 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14078 return;
14079 }
14080 if (IncomingInputs.empty()) {
14081 // Just fix all of the in place inputs.
14082 for (int Input : InPlaceInputs) {
14083 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14084 PSHUFDMask[Input / 2] = Input / 2;
14085 }
14086 return;
14087 }
14088
14089 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14090 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14091 InPlaceInputs[0] - HalfOffset;
14092 // Put the second input next to the first so that they are packed into
14093 // a dword. We find the adjacent index by toggling the low bit.
14094 int AdjIndex = InPlaceInputs[0] ^ 1;
14095 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14096 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
14097 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14098 };
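// [Editor's note, not part of the original source] Example of fixInPlaceInputs
// with two in-place inputs and cross-half incoming inputs present, assuming
// InPlaceInputs = {5, 7} and HalfOffset = 4: word 5 stays in slot 1 of the
// high half, AdjIndex = 5 ^ 1 = 4, so word 7 is scheduled into slot 4
// (SourceHalfMask[0] = 3), references to 7 in the half mask are rewritten to
// 4, and PSHUFDMask[2] = 2 pins that dword in place.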
14099 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14100 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14101
14102 // Now gather the cross-half inputs and place them into a free dword of
14103 // their target half.
14104 // FIXME: This operation could almost certainly be simplified dramatically to
14105 // look more like the 3-1 fixing operation.
14106 auto moveInputsToRightHalf = [&PSHUFDMask](
14107 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14108 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14109 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14110 int DestOffset) {
14111 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14112 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14113 };
14114 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14115 int Word) {
14116 int LowWord = Word & ~1;
14117 int HighWord = Word | 1;
14118 return isWordClobbered(SourceHalfMask, LowWord) ||
14119 isWordClobbered(SourceHalfMask, HighWord);
14120 };
14121
14122 if (IncomingInputs.empty())
14123 return;
14124
14125 if (ExistingInputs.empty()) {
14126 // Map any dwords with inputs from them into the right half.
14127 for (int Input : IncomingInputs) {
14128 // If the source half mask maps over the inputs, turn those into
14129 // swaps and use the swapped lane.
14130 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14131 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14132 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14133 Input - SourceOffset;
14134 // We have to swap the uses in our half mask in one sweep.
14135 for (int &M : HalfMask)
14136 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14137 M = Input;
14138 else if (M == Input)
14139 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14140 } else {
14141 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14142            Input - SourceOffset &&
14143        "Previous placement doesn't match!");
14144 }
14145 // Note that this correctly re-maps both when we do a swap and when
14146 // we observe the other side of the swap above. We rely on that to
14147 // avoid swapping the members of the input list directly.
14148 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14149 }
14150
14151 // Map the input's dword into the correct half.
14152 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14153 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14154 else
14155 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14156            Input / 2 &&
14157        "Previous placement doesn't match!");
14158 }
14159
14160 // And just directly shift any other-half mask elements to be same-half
14161 // as we will have mirrored the dword containing the element into the
14162 // same position within that half.
14163 for (int &M : HalfMask)
14164 if (M >= SourceOffset && M < SourceOffset + 4) {
14165 M = M - SourceOffset + DestOffset;
14166 assert(M >= 0 && "This should never wrap below zero!");
14167 }
14168 return;
14169 }
14170
14171 // Ensure we have the input in a viable dword of its current half. This
14172 // is particularly tricky because the original position may be clobbered
14173 // by inputs being moved and *staying* in that half.
14174 if (IncomingInputs.size() == 1) {
14175 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14176 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14177 SourceOffset;
14178 SourceHalfMask[InputFixed - SourceOffset] =
14179 IncomingInputs[0] - SourceOffset;
14180 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
14181 InputFixed);
14182 IncomingInputs[0] = InputFixed;
14183 }
14184 } else if (IncomingInputs.size() == 2) {
14185 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14186 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14187 // We have two non-adjacent or clobbered inputs we need to extract from
14188 // the source half. To do this, we need to map them into some adjacent
14189 // dword slot in the source mask.
14190 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14191 IncomingInputs[1] - SourceOffset};
14192
14193 // If there is a free slot in the source half mask adjacent to one of
14194 // the inputs, place the other input in it. We use (Index XOR 1) to
14195 // compute an adjacent index.
14196 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14197 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14198 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14199 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14200 InputsFixed[1] = InputsFixed[0] ^ 1;
14201 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14202 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14203 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14204 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14205 InputsFixed[0] = InputsFixed[1] ^ 1;
14206 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14207 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14208 // The two inputs are in the same DWord but it is clobbered and the
14209 // adjacent DWord isn't used at all. Move both inputs to the free
14210 // slot.
14211 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14212 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14213 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14214 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14215 } else {
14216 // The only way we hit this point is if there is no clobbering
14217 // (because there are no off-half inputs to this half) and there is no
14218 // free slot adjacent to one of the inputs. In this case, we have to
14219 // swap an input with a non-input.
14220 for (int i = 0; i < 4; ++i)
14221 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14222        "We can't handle any clobbers here!");
14223 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14224        "Cannot have adjacent inputs here!");
14225
14226 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14227 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14228
14229 // We also have to update the final source mask in this case because
14230 // it may need to undo the above swap.
14231 for (int &M : FinalSourceHalfMask)
14232 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14233 M = InputsFixed[1] + SourceOffset;
14234 else if (M == InputsFixed[1] + SourceOffset)
14235 M = (InputsFixed[0] ^ 1) + SourceOffset;
14236
14237 InputsFixed[1] = InputsFixed[0] ^ 1;
14238 }
14239
14240 // Point everything at the fixed inputs.
14241 for (int &M : HalfMask)
14242 if (M == IncomingInputs[0])
14243 M = InputsFixed[0] + SourceOffset;
14244 else if (M == IncomingInputs[1])
14245 M = InputsFixed[1] + SourceOffset;
14246
14247 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14248 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14249 }
14250 } else {
14251 llvm_unreachable("Unhandled input size!");
14252 }
14253
14254 // Now hoist the DWord down to the right half.
14255 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
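// [Editor's note, not part of the original source] FreeDWord picks the first
// unused dword slot inside the destination half: with DestOffset = 0 the
// candidates are dwords 0 and 1, with DestOffset = 4 they are dwords 2 and 3,
// and the second candidate is chosen whenever the first is already claimed
// in PSHUFDMask.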
14256 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14257 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14258 for (int &M : HalfMask)
14259 for (int Input : IncomingInputs)
14260 if (M == Input)
14261 M = FreeDWord * 2 + Input % 2;
14262 };
14263 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14264 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14265 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14266 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14267
14268 // Now enact all the shuffles we've computed to move the inputs into their
14269 // target half.
14270 if (!isNoopShuffleMask(PSHUFLMask))
14271 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14272 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14273 if (!isNoopShuffleMask(PSHUFHMask))
14274 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14275 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14276 if (!isNoopShuffleMask(PSHUFDMask))
14277 V = DAG.getBitcast(
14278 VT,
14279 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14280 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14281
14282 // At this point, each half should contain all its inputs, and we can then
14283 // just shuffle them into their final position.
14284 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14285        "Failed to lift all the high half inputs to the low mask!");
14286 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14287        "Failed to lift all the low half inputs to the high mask!");
14288
14289 // Do a half shuffle for the low mask.
14290 if (!isNoopShuffleMask(LoMask))
14291 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14292 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14293
14294 // Do a half shuffle with the high mask after shifting its values down.
14295 for (int &M : HiMask)
14296 if (M >= 0)
14297 M -= 4;
14298 if (!isNoopShuffleMask(HiMask))
14299 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14300 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14301
14302 return V;
14303}
14304
14305/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14306/// blend if only one input is used.
14307static SDValue lowerShuffleAsBlendOfPSHUFBs(
14308 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14309 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14310 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14311        "Lane crossing shuffle masks not supported");
14312
14313 int NumBytes = VT.getSizeInBits() / 8;
14314 int Size = Mask.size();
14315 int Scale = NumBytes / Size;
14316
14317 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14318 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14319 V1InUse = false;
14320 V2InUse = false;
14321
14322 for (int i = 0; i < NumBytes; ++i) {
14323 int M = Mask[i / Scale];
14324 if (M < 0)
14325 continue;
14326
14327 const int ZeroMask = 0x80;
14328 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14329 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14330 if (Zeroable[i / Scale])
14331 V1Idx = V2Idx = ZeroMask;
14332
14333 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14334 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14335 V1InUse |= (ZeroMask != V1Idx);
14336 V2InUse |= (ZeroMask != V2Idx);
14337 }
14338
14339 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14340 if (V1InUse)
14341 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14342 DAG.getBuildVector(ShufVT, DL, V1Mask));
14343 if (V2InUse)
14344 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14345 DAG.getBuildVector(ShufVT, DL, V2Mask));
14346
14347 // If we need shuffled inputs from both, blend the two.
14348 SDValue V;
14349 if (V1InUse && V2InUse)
14350 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14351 else
14352 V = V1InUse ? V1 : V2;
14353
14354 // Cast the result back to the correct type.
14355 return DAG.getBitcast(VT, V);
14356}
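
// Illustrative sketch (not part of the original file): the per-byte index
// math used above to build the two PSHUFB control vectors, reduced to plain
// integers for a v8i16 mask widened to bytes. The helper name, the fixed
// sizes, and the output convention (-1 for undef bytes) are assumptions made
// for this example; 0x80 is the architectural PSHUFB "write zero" marker.
static void sketchBuildPSHUFBByteMasks(const int Mask[8], int V1Bytes[16],
                                       int V2Bytes[16]) {
  const int Size = 8;        // number of mask elements (v8i16)
  const int Scale = 2;       // bytes per element
  const int ZeroMask = 0x80; // PSHUFB index that forces a zero byte
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / Scale];
    if (M < 0) {
      V1Bytes[i] = V2Bytes[i] = -1; // undef byte
      continue;
    }
    // A byte taken from V1 keeps its byte index and zeroes the V2 lane, and
    // vice versa, so the two PSHUFB results can be blended with a plain OR.
    V1Bytes[i] = M < Size ? M * Scale + i % Scale : ZeroMask;
    V2Bytes[i] = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
  }
}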
14357
14358/// Generic lowering of 8-lane i16 shuffles.
14359///
14360/// This handles both single-input shuffles and combined shuffle/blends with
14361/// two inputs. The single input shuffles are immediately delegated to
14362/// a dedicated lowering routine.
14363///
14364/// The blends are lowered in one of three fundamental ways. If there are few
14365/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14366/// of the input is significantly cheaper when lowered as an interleaving of
14367/// the two inputs, try to interleave them. Otherwise, blend the low and high
14368/// halves of the inputs separately (making them have relatively few inputs)
14369/// and then concatenate them.
14370static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14371 const APInt &Zeroable, SDValue V1, SDValue V2,
14372 const X86Subtarget &Subtarget,
14373 SelectionDAG &DAG) {
14374 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14375 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14376 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14377
14378 // Whenever we can lower this as a zext, that instruction is strictly faster
14379 // than any alternative.
14380 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14381 Zeroable, Subtarget, DAG))
14382 return ZExt;
14383
14384 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14385
14386 if (NumV2Inputs == 0) {
14387 // Try to use shift instructions.
14388 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
14389 Zeroable, Subtarget, DAG))
14390 return Shift;
14391
14392 // Check for being able to broadcast a single element.
14393 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14394 Mask, Subtarget, DAG))
14395 return Broadcast;
14396
14397 // Try to use bit rotation instructions.
14398 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14399 Subtarget, DAG))
14400 return Rotate;
14401
14402 // Use dedicated unpack instructions for masks that match their pattern.
14403 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
14404 return V;
14405
14406 // Use dedicated pack instructions for masks that match their pattern.
14407 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14408 Subtarget))
14409 return V;
14410
14411 // Try to use byte rotation instructions.
14412 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14413 Subtarget, DAG))
14414 return Rotate;
14415
14416 // Make a copy of the mask so it can be modified.
14417 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
14418 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14419 Subtarget, DAG);
14420 }
14421
14422 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14423        "All single-input shuffles should be canonicalized to be V1-input "
14424        "shuffles.");
14425
14426 // Try to use shift instructions.
14427 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
14428 Zeroable, Subtarget, DAG))
14429 return Shift;
14430
14431 // See if we can use SSE4A Extraction / Insertion.
14432 if (Subtarget.hasSSE4A())
14433 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14434 Zeroable, DAG))
14435 return V;
14436
14437 // There are special ways we can lower some single-element blends.
14438 if (NumV2Inputs == 1)
14439 if (SDValue V = lowerShuffleAsElementInsertion(
14440 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14441 return V;
14442
14443 // We have different paths for blend lowering, but they all must use the
14444 // *exact* same predicate.
14445 bool IsBlendSupported = Subtarget.hasSSE41();
14446 if (IsBlendSupported)
14447 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14448 Zeroable, Subtarget, DAG))
14449 return Blend;
14450
14451 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14452 Zeroable, Subtarget, DAG))
14453 return Masked;
14454
14455 // Use dedicated unpack instructions for masks that match their pattern.
14456 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
14457 return V;
14458
14459 // Use dedicated pack instructions for masks that match their pattern.
14460 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
14461 Subtarget))
14462 return V;
14463
14464 // Try to use byte rotation instructions.
14465 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14466 Subtarget, DAG))
14467 return Rotate;
14468
14469 if (SDValue BitBlend =
14470 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14471 return BitBlend;
14472
14473 // Try to use byte shift instructions to mask.
14474 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14475 Zeroable, Subtarget, DAG))
14476 return V;
14477
14478 // Try to lower by permuting the inputs into an unpack instruction.
14479 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14480 Mask, Subtarget, DAG))
14481 return Unpack;
14482
14483 // If we can't directly blend but can use PSHUFB, that will be better as it
14484 // can both shuffle and set up the inefficient blend.
14485 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14486 bool V1InUse, V2InUse;
14487 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14488 Zeroable, DAG, V1InUse, V2InUse);
14489 }
14490
14491 // We can always bit-blend if we have to so the fallback strategy is to
14492 // decompose into single-input permutes and blends.
14493 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
14494 Mask, Subtarget, DAG);
14495}
14496
14497/// Check whether a compaction lowering can be done by dropping even
14498/// elements and compute how many times even elements must be dropped.
14499///
14500/// This handles shuffles which take every Nth element where N is a power of
14501/// two. Example shuffle masks:
14502///
14503/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
14504/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
14505/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
14506/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
14507/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
14508/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
14509///
14510/// Any of these lanes can of course be undef.
14511///
14512/// This routine only supports N <= 3.
14513/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
14514/// for larger N.
14515///
14516 /// \returns N above, i.e. the number of times even elements must be dropped,
14517 /// if there is such a number. Otherwise returns zero.
14518static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
14519 bool IsSingleInput) {
14520 // The modulus for the shuffle vector entries is based on whether this is
14521 // a single input or not.
14522 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
14523 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
14524        "We should only be called with masks with a power-of-2 size!");
14525
14526 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
14527
14528 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
14529 // and 2^3 simultaneously. This is because we may have ambiguity with
14530 // partially undef inputs.
14531 bool ViableForN[3] = {true, true, true};
14532
14533 for (int i = 0, e = Mask.size(); i < e; ++i) {
14534 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
14535 // want.
14536 if (Mask[i] < 0)
14537 continue;
14538
14539 bool IsAnyViable = false;
14540 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
14541 if (ViableForN[j]) {
14542 uint64_t N = j + 1;
14543
14544 // The shuffle mask must be equal to (i * 2^N) % M.
14545 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
14546 IsAnyViable = true;
14547 else
14548 ViableForN[j] = false;
14549 }
14550 // Early exit if we exhaust the possible powers of two.
14551 if (!IsAnyViable)
14552 break;
14553 }
14554
14555 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
14556 if (ViableForN[j])
14557 return j + 1;
14558
14559 // Return 0 as there is no viable power of two.
14560 return 0;
14561}
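
// Illustrative sketch (not part of the original file): the stride test above
// written as a standalone helper for a fixed 16-element mask. It returns the
// smallest N in 1..3 for which every defined mask entry equals (i << N)
// reduced modulo the (power-of-two) modulus, or 0 if no such N exists. The
// helper name and the fixed mask size are assumptions made for this example.
static int sketchFindEvenDropStride(const int Mask[16], bool IsSingleInput) {
  const int Size = 16;
  const int ModMask = Size * (IsSingleInput ? 1 : 2) - 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue; // undef lanes may collapse to any pattern
      if (Mask[i] != ((i << N) & ModMask)) {
        Viable = false;
        break;
      }
    }
    if (Viable)
      return N;
  }
  return 0;
}
// For instance, the single-input mask 0,2,4,...,14,0,2,...,14 from the first
// example in the comment above yields N = 1.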
14562
14563static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14564 ArrayRef<int> Mask, SDValue V1,
14565 SDValue V2, SelectionDAG &DAG) {
14566 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
14567 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
14568
14569 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
14570 if (V2.isUndef())
14571 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
14572
14573 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
14574}
14575
14576/// Generic lowering of v16i8 shuffles.
14577///
14578/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14579/// detect any complexity reducing interleaving. If that doesn't help, it uses
14580/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14581/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14582/// back together.
14583static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14584 const APInt &Zeroable, SDValue V1, SDValue V2,
14585 const X86Subtarget &Subtarget,
14586 SelectionDAG &DAG) {
14587 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14588 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14589 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14590
14591 // Try to use shift instructions.
14592 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
14593 Zeroable, Subtarget, DAG))
14594 return Shift;
14595
14596 // Try to use byte rotation instructions.
14597 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14598 Subtarget, DAG))
14599 return Rotate;
14600
14601 // Use dedicated pack instructions for masks that match their pattern.
14602 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14603 Subtarget))
14604 return V;
14605
14606 // Try to use a zext lowering.
14607 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14608 Zeroable, Subtarget, DAG))
14609 return ZExt;
14610
14611 // See if we can use SSE4A Extraction / Insertion.
14612 if (Subtarget.hasSSE4A())
14613 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14614 Zeroable, DAG))
14615 return V;
14616
14617 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14618
14619 // For single-input shuffles, there are some nicer lowering tricks we can use.
14620 if (NumV2Elements == 0) {
14621 // Check for being able to broadcast a single element.
14622 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14623 Mask, Subtarget, DAG))
14624 return Broadcast;
14625
14626 // Try to use bit rotation instructions.
14627 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14628 Subtarget, DAG))
14629 return Rotate;
14630
14631 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14632 return V;
14633
14634 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14635 // Notably, this handles splat and partial-splat shuffles more efficiently.
14636 // However, it only makes sense if the pre-duplication shuffle simplifies
14637 // things significantly. Currently, this means we need to be able to
14638 // express the pre-duplication shuffle as an i16 shuffle.
14639 //
14640 // FIXME: We should check for other patterns which can be widened into an
14641 // i16 shuffle as well.
14642 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14643 for (int i = 0; i < 16; i += 2)
14644 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14645 return false;
14646
14647 return true;
14648 };
14649 auto tryToWidenViaDuplication = [&]() -> SDValue {
14650 if (!canWidenViaDuplication(Mask))
14651 return SDValue();
14652 SmallVector<int, 4> LoInputs;
14653 copy_if(Mask, std::back_inserter(LoInputs),
14654 [](int M) { return M >= 0 && M < 8; });
14655 array_pod_sort(LoInputs.begin(), LoInputs.end());
14656 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14657 LoInputs.end());
14658 SmallVector<int, 4> HiInputs;
14659 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14660 array_pod_sort(HiInputs.begin(), HiInputs.end());
14661 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14662 HiInputs.end());
14663
14664 bool TargetLo = LoInputs.size() >= HiInputs.size();
14665 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14666 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14667
14668 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14669 SmallDenseMap<int, int, 8> LaneMap;
14670 for (int I : InPlaceInputs) {
14671 PreDupI16Shuffle[I/2] = I/2;
14672 LaneMap[I] = I;
14673 }
14674 int j = TargetLo ? 0 : 4, je = j + 4;
14675 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14676 // Check if j is already a shuffle of this input. This happens when
14677 // there are two adjacent bytes after we move the low one.
14678 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14679 // If we haven't yet mapped the input, search for a slot into which
14680 // we can map it.
14681 while (j < je && PreDupI16Shuffle[j] >= 0)
14682 ++j;
14683
14684 if (j == je)
14685 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14686 return SDValue();
14687
14688 // Map this input with the i16 shuffle.
14689 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14690 }
14691
14692 // Update the lane map based on the mapping we ended up with.
14693 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14694 }
14695 V1 = DAG.getBitcast(
14696 MVT::v16i8,
14697 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14698 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14699
14700 // Unpack the bytes to form the i16s that will be shuffled into place.
14701 bool EvenInUse = false, OddInUse = false;
14702 for (int i = 0; i < 16; i += 2) {
14703 EvenInUse |= (Mask[i + 0] >= 0);
14704 OddInUse |= (Mask[i + 1] >= 0);
14705 if (EvenInUse && OddInUse)
14706 break;
14707 }
14708 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14709 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14710 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14711
14712 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14713 for (int i = 0; i < 16; ++i)
14714 if (Mask[i] >= 0) {
14715 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14716 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14717 if (PostDupI16Shuffle[i / 2] < 0)
14718 PostDupI16Shuffle[i / 2] = MappedMask;
14719 else
14720 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14721        "Conflicting entries in the original shuffle!");
14722 }
14723 return DAG.getBitcast(
14724 MVT::v16i8,
14725 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14726 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14727 };
14728 if (SDValue V = tryToWidenViaDuplication())
14729 return V;
14730 }
14731
14732 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14733 Zeroable, Subtarget, DAG))
14734 return Masked;
14735
14736 // Use dedicated unpack instructions for masks that match their pattern.
14737 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14738 return V;
14739
14740 // Try to use byte shift instructions to mask.
14741 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14742 Zeroable, Subtarget, DAG))
14743 return V;
14744
14745 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14746 // with PSHUFB. It is important to do this before we attempt to generate any
14747 // blends but after all of the single-input lowerings. If the single input
14748 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14749 // want to preserve that and we can DAG combine any longer sequences into
14750 // a PSHUFB in the end. But once we start blending from multiple inputs,
14751 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14752 // and there are *very* few patterns that would actually be faster than the
14753 // PSHUFB approach because of its ability to zero lanes.
14754 //
14755 // FIXME: The only exceptions to the above are blends which are exact
14756 // interleavings with direct instructions supporting them. We currently don't
14757 // handle those well here.
14758 if (Subtarget.hasSSSE3()) {
14759 bool V1InUse = false;
14760 bool V2InUse = false;
14761
14762 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14763 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14764
14765 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14766 // do so. This avoids using them to handle blends-with-zero which is
14767 // important as a single pshufb is significantly faster for that.
14768 if (V1InUse && V2InUse) {
14769 if (Subtarget.hasSSE41())
14770 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14771 Zeroable, Subtarget, DAG))
14772 return Blend;
14773
14774 // We can use an unpack to do the blending rather than an or in some
14775 // cases. Even though the or may be (very minorly) more efficient, we
14776 // prefer this lowering because there are common cases where part of
14777 // the complexity of the shuffles goes away when we do the final blend as
14778 // an unpack.
14779 // FIXME: It might be worth trying to detect if the unpack-feeding
14780 // shuffles will both be pshufb, in which case we shouldn't bother with
14781 // this.
14782 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14783 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14784 return Unpack;
14785
14786 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
14787 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14788 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
14789
14790 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14791 // PALIGNR will be cheaper than the second PSHUFB+OR.
14792 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14793 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14794 return V;
14795 }
14796
14797 return PSHUFB;
14798 }
14799
14800 // There are special ways we can lower some single-element blends.
14801 if (NumV2Elements == 1)
14802 if (SDValue V = lowerShuffleAsElementInsertion(
14803 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14804 return V;
14805
14806 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14807 return Blend;
14808
14809 // Check whether a compaction lowering can be done. This handles shuffles
14810 // which take every Nth element for some power-of-two N. See the helper function for
14811 // details.
14812 //
14813 // We special case these as they can be particularly efficiently handled with
14814 // the PACKUSWB instruction on x86 and they show up in common patterns of
14815 // rearranging bytes to truncate wide elements.
14816 bool IsSingleInput = V2.isUndef();
14817 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
14818 // NumEvenDrops is the power of two stride of the elements. Another way of
14819 // thinking about it is that we need to drop the even elements this many
14820 // times to get the original input.
14821
14822 // First we need to zero all the dropped bytes.
14823 assert(NumEvenDrops <= 3 &&
14824        "No support for dropping even elements more than 3 times.");
14825 SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
14826 for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
14827 ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
14828 SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
14829 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
14830 if (!IsSingleInput)
14831 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
14832
14833 // Now pack things back together.
14834 V1 = DAG.getBitcast(MVT::v8i16, V1);
14835 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
14836 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
14837 for (int i = 1; i < NumEvenDrops; ++i) {
14838 Result = DAG.getBitcast(MVT::v8i16, Result);
14839 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14840 }
14841
14842 return Result;
14843 }
14844
14845 // Handle multi-input cases by blending single-input shuffles.
14846 if (NumV2Elements > 0)
14847 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask,
14848 Subtarget, DAG);
14849
14850 // The fallback path for single-input shuffles widens this into two v8i16
14851 // vectors with unpacks, shuffles those, and then pulls them back together
14852 // with a pack.
14853 SDValue V = V1;
14854
14855 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14856 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14857 for (int i = 0; i < 16; ++i)
14858 if (Mask[i] >= 0)
14859 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14860
14861 SDValue VLoHalf, VHiHalf;
14862 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14863 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14864 // i16s.
14865 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14866 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14867 // Use a mask to drop the high bytes.
14868 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14869 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14870 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14871
14872 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14873 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14874
14875 // Squash the masks to point directly into VLoHalf.
14876 for (int &M : LoBlendMask)
14877 if (M >= 0)
14878 M /= 2;
14879 for (int &M : HiBlendMask)
14880 if (M >= 0)
14881 M /= 2;
14882 } else {
14883 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14884 // VHiHalf so that we can blend them as i16s.
14885 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14886
14887 VLoHalf = DAG.getBitcast(
14888 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14889 VHiHalf = DAG.getBitcast(
14890 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14891 }
14892
14893 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14894 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14895
14896 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14897}
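
// Illustrative sketch (not part of the original file): the byte-clear mask
// built in the compaction path above, in scalar form. Only every
// (1 << NumEvenDrops)-th byte survives the repeated PACKUS steps, so all
// other bytes must be zeroed first. The helper name is an assumption made
// for this example.
static void sketchBuildByteClearMask(int NumEvenDrops, unsigned char Out[16]) {
  for (int i = 0; i < 16; ++i)
    Out[i] = 0x00; // bytes that will be dropped by the packs
  for (int i = 0; i < 16; i += 1 << NumEvenDrops)
    Out[i] = 0xFF; // bytes that survive the compaction
}
// E.g. NumEvenDrops == 1 keeps bytes 0, 2, 4, ..., 14, matching a single
// word-to-byte PACKUS truncation.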
14898
14899/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14900///
14901/// This routine breaks down the specific type of 128-bit shuffle and
14902/// dispatches to the lowering routines accordingly.
14903static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14904 MVT VT, SDValue V1, SDValue V2,
14905 const APInt &Zeroable,
14906 const X86Subtarget &Subtarget,
14907 SelectionDAG &DAG) {
14908 switch (VT.SimpleTy) {
14909 case MVT::v2i64:
14910 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14911 case MVT::v2f64:
14912 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14913 case MVT::v4i32:
14914 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14915 case MVT::v4f32:
14916 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14917 case MVT::v8i16:
14918 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14919 case MVT::v16i8:
14920 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14921
14922 default:
14923 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 14923)
;
14924 }
14925}
14926
14927/// Generic routine to split vector shuffle into half-sized shuffles.
14928///
14929/// This routine just extracts two subvectors, shuffles them independently, and
14930/// then concatenates them back together. This should work effectively with all
14931/// AVX vector shuffle types.
14932static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14933 SDValue V2, ArrayRef<int> Mask,
14934 SelectionDAG &DAG) {
14935 assert(VT.getSizeInBits() >= 256 &&
14936        "Only for 256-bit or wider vector shuffles!");
14937 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14938 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14939
14940 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14941 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14942
14943 int NumElements = VT.getVectorNumElements();
14944 int SplitNumElements = NumElements / 2;
14945 MVT ScalarVT = VT.getVectorElementType();
14946 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
14947
14948 // Rather than splitting build-vectors, just build two narrower build
14949 // vectors. This helps shuffling with splats and zeros.
14950 auto SplitVector = [&](SDValue V) {
14951 V = peekThroughBitcasts(V);
14952
14953 MVT OrigVT = V.getSimpleValueType();
14954 int OrigNumElements = OrigVT.getVectorNumElements();
14955 int OrigSplitNumElements = OrigNumElements / 2;
14956 MVT OrigScalarVT = OrigVT.getVectorElementType();
14957 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
14958
14959 SDValue LoV, HiV;
14960
14961 auto *BV = dyn_cast<BuildVectorSDNode>(V);
14962 if (!BV) {
14963 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
14964 DAG.getIntPtrConstant(0, DL));
14965 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
14966 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
14967 } else {
14968
14969 SmallVector<SDValue, 16> LoOps, HiOps;
14970 for (int i = 0; i < OrigSplitNumElements; ++i) {
14971 LoOps.push_back(BV->getOperand(i));
14972 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
14973 }
14974 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
14975 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
14976 }
14977 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14978 DAG.getBitcast(SplitVT, HiV));
14979 };
14980
14981 SDValue LoV1, HiV1, LoV2, HiV2;
14982 std::tie(LoV1, HiV1) = SplitVector(V1);
14983 std::tie(LoV2, HiV2) = SplitVector(V2);
14984
14985 // Now create two 4-way blends of these half-width vectors.
14986 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14987 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
14988 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14989 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14990 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14991 for (int i = 0; i < SplitNumElements; ++i) {
14992 int M = HalfMask[i];
14993 if (M >= NumElements) {
14994 if (M >= NumElements + SplitNumElements)
14995 UseHiV2 = true;
14996 else
14997 UseLoV2 = true;
14998 V2BlendMask[i] = M - NumElements;
14999 BlendMask[i] = SplitNumElements + i;
15000 } else if (M >= 0) {
15001 if (M >= SplitNumElements)
15002 UseHiV1 = true;
15003 else
15004 UseLoV1 = true;
15005 V1BlendMask[i] = M;
15006 BlendMask[i] = i;
15007 }
15008 }
15009
15010 // Because the lowering happens after all combining takes place, we need to
15011 // manually combine these blend masks as much as possible so that we create
15012 // a minimal number of high-level vector shuffle nodes.
15013
15014 // First try just blending the halves of V1 or V2.
15015 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15016 return DAG.getUNDEF(SplitVT);
15017 if (!UseLoV2 && !UseHiV2)
15018 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15019 if (!UseLoV1 && !UseHiV1)
15020 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15021
15022 SDValue V1Blend, V2Blend;
15023 if (UseLoV1 && UseHiV1) {
15024 V1Blend =
15025 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15026 } else {
15027 // We only use half of V1 so map the usage down into the final blend mask.
15028 V1Blend = UseLoV1 ? LoV1 : HiV1;
15029 for (int i = 0; i < SplitNumElements; ++i)
15030 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15031 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15032 }
15033 if (UseLoV2 && UseHiV2) {
15034 V2Blend =
15035 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15036 } else {
15037 // We only use half of V2 so map the usage down into the final blend mask.
15038 V2Blend = UseLoV2 ? LoV2 : HiV2;
15039 for (int i = 0; i < SplitNumElements; ++i)
15040 if (BlendMask[i] >= SplitNumElements)
15041 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15042 }
15043 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15044 };
15045 SDValue Lo = HalfBlend(LoMask);
15046 SDValue Hi = HalfBlend(HiMask);
15047 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15048}
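
// Illustrative sketch (not part of the original file): the mask bookkeeping
// performed by HalfBlend above, reduced to the index arithmetic. For one half
// of a width-NumElements shuffle it classifies each entry as coming from V1
// or V2, rebases it into half-width index space, and records which blend
// operand supplies the lane. All names are assumptions made for this example.
static void sketchClassifyHalfMask(const int *HalfMask, int SplitNumElements,
                                   int NumElements, int *V1BlendMask,
                                   int *V2BlendMask, int *BlendMask) {
  for (int i = 0; i < SplitNumElements; ++i) {
    V1BlendMask[i] = V2BlendMask[i] = BlendMask[i] = -1;
    int M = HalfMask[i];
    if (M >= NumElements) {
      // From V2: rebase the index and take the second blend operand.
      V2BlendMask[i] = M - NumElements;
      BlendMask[i] = SplitNumElements + i;
    } else if (M >= 0) {
      // From V1: keep the index and take the first blend operand.
      V1BlendMask[i] = M;
      BlendMask[i] = i;
    }
  }
}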
15049
15050/// Either split a vector in halves or decompose the shuffles and the
15051/// blend.
15052///
15053/// This is provided as a good fallback for many lowerings of non-single-input
15054/// shuffles with more than one 128-bit lane. In those cases, we want to select
15055/// between splitting the shuffle into 128-bit components and stitching those
15056/// back together vs. extracting the single-input shuffles and blending those
15057/// results.
15058static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15059 SDValue V2, ArrayRef<int> Mask,
15060 const X86Subtarget &Subtarget,
15061 SelectionDAG &DAG) {
15062 assert(!V2.isUndef() && "This routine must not be used to lower single-input "((!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? static_cast
<void> (0) : __assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15063, __PRETTY_FUNCTION__))
15063 "shuffles as it could then recurse on itself.")((!V2.isUndef() && "This routine must not be used to lower single-input "
"shuffles as it could then recurse on itself.") ? static_cast
<void> (0) : __assert_fail ("!V2.isUndef() && \"This routine must not be used to lower single-input \" \"shuffles as it could then recurse on itself.\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 15063, __PRETTY_FUNCTION__))
;
15064 int Size = Mask.size();
15065
15066 // If this can be modeled as a broadcast of two elements followed by a blend,
15067 // prefer that lowering. This is especially important because broadcasts can
15068 // often fold with memory operands.
15069 auto DoBothBroadcast = [&] {
15070 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15071 for (int M : Mask)
15072 if (M >= Size) {
15073 if (V2BroadcastIdx < 0)
15074 V2BroadcastIdx = M - Size;
15075 else if (M - Size != V2BroadcastIdx)
15076 return false;
15077 } else if (M >= 0) {
15078 if (V1BroadcastIdx < 0)
15079 V1BroadcastIdx = M;
15080 else if (M != V1BroadcastIdx)
15081 return false;
15082 }
15083 return true;
15084 };
15085 if (DoBothBroadcast())
15086 return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
15087 Subtarget, DAG);
15088
15089 // If the inputs all stem from a single 128-bit lane of each input, then we
15090 // split them rather than blending because the split will decompose to
15091 // unusually few instructions.
15092 int LaneCount = VT.getSizeInBits() / 128;
15093 int LaneSize = Size / LaneCount;
15094 SmallBitVector LaneInputs[2];
15095 LaneInputs[0].resize(LaneCount, false);
15096 LaneInputs[1].resize(LaneCount, false);
15097 for (int i = 0; i < Size; ++i)
15098 if (Mask[i] >= 0)
15099 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15100 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15101 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15102
15103 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
15104 // that the decomposed single-input shuffles don't end up here.
15105 return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget,
15106 DAG);
15107}
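
// Illustrative sketch (not part of the original file): the DoBothBroadcast
// test above as a standalone predicate. It reports whether every defined mask
// entry reads one fixed element of V1 or one fixed element of V2, i.e. the
// shuffle is a blend of two broadcasts. Names are assumptions for the example.
static bool sketchIsBlendOfTwoBroadcasts(const int *Mask, int Size) {
  int V1Idx = -1, V2Idx = -1;
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef entries are compatible with any broadcast
    if (M >= Size) {
      if (V2Idx >= 0 && V2Idx != M - Size)
        return false;
      V2Idx = M - Size;
    } else {
      if (V1Idx >= 0 && V1Idx != M)
        return false;
      V1Idx = M;
    }
  }
  return true;
}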
15108
15109// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15110// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15111static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15112 SDValue V1, SDValue V2,
15113 ArrayRef<int> Mask,
15114 SelectionDAG &DAG) {
15115 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15116
15117 int LHSMask[4] = {-1, -1, -1, -1};
15118 int RHSMask[4] = {-1, -1, -1, -1};
15119 unsigned SHUFPMask = 0;
15120
15121 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15122 // perform the shuffle once the lanes have been shuffled in place.
15123 for (int i = 0; i != 4; ++i) {
15124 int M = Mask[i];
15125 if (M < 0)
15126 continue;
15127 int LaneBase = i & ~1;
15128 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15129 LaneMask[LaneBase + (M & 1)] = M;
15130 SHUFPMask |= (M & 1) << i;
15131 }
15132
15133 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15134 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15135 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15136 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
15137}
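
// Illustrative sketch (not part of the original file): how the SHUFPD
// immediate above is assembled for a v4f64 mask. Bit i of the immediate picks
// the even or odd element of the operand feeding destination lane i, which is
// exactly (M & 1) for a defined entry. The helper name is an assumption made
// for this example.
static unsigned sketchBuildSHUFPDImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (unsigned)(Mask[i] & 1) << i;
  return Imm;
}
// E.g. the mask {1, 5, 2, 6} yields 0b0011: destination lanes 0 and 1 take
// odd source elements, lanes 2 and 3 take even source elements.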
15138
15139/// Lower a vector shuffle crossing multiple 128-bit lanes as
15140/// a lane permutation followed by a per-lane permutation.
15141///
15142/// This is mainly for cases where we can have non-repeating permutes
15143/// in each lane.
15144///
15145/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15146/// we should investigate merging them.
15147static SDValue lowerShuffleAsLanePermuteAndPermute(
15148 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15149 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15150 int NumElts = VT.getVectorNumElements();
15151 int NumLanes = VT.getSizeInBits() / 128;
15152 int NumEltsPerLane = NumElts / NumLanes;
15153
15154 SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
15155 SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
15156
15157 for (int i = 0; i != NumElts; ++i) {
15158 int M = Mask[i];
15159 if (M < 0)
15160 continue;
15161
15162 // Ensure that each lane comes from a single source lane.
15163 int SrcLane = M / NumEltsPerLane;
15164 int DstLane = i / NumEltsPerLane;
15165 if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
15166 return SDValue();
15167 SrcLaneMask[DstLane] = SrcLane;
15168
15169 PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
15170 }
15171
15172 // Make sure we set all elements of the lane mask, to avoid undef propagation.
15173 SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
15174 for (int DstLane = 0; DstLane != NumLanes; ++DstLane) {
15175 int SrcLane = SrcLaneMask[DstLane];
15176 if (0 <= SrcLane)
15177 for (int j = 0; j != NumEltsPerLane; ++j) {
15178 LaneMask[(DstLane * NumEltsPerLane) + j] =
15179 (SrcLane * NumEltsPerLane) + j;
15180 }
15181 }
15182
15183 // If we're only shuffling a single lowest lane and the rest are identity
15184 // then don't bother.
15185 // TODO - isShuffleMaskInputInPlace could be extended to something like this.
15186 int NumIdentityLanes = 0;
15187 bool OnlyShuffleLowestLane = true;
15188 for (int i = 0; i != NumLanes; ++i) {
15189 if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
15190 i * NumEltsPerLane))
15191 NumIdentityLanes++;
15192 else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
15193 OnlyShuffleLowestLane = false;
15194 }
15195 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15196 return SDValue();
15197
15198 SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
15199 return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
15200}
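
// Illustrative sketch (not part of the original file): the two masks derived
// above, for a hypothetical 8-element vector with two 128-bit lanes of four
// elements each. Each destination lane must read from exactly one source
// lane; LaneMask describes the cross-lane move and PermMask the in-lane
// reordering that follows. Names and sizes are assumptions for this example.
static bool sketchSplitLanePermute(const int Mask[8], int LaneMask[8],
                                   int PermMask[8]) {
  const int NumEltsPerLane = 4;
  int SrcLane[2] = {-1, -1};
  for (int i = 0; i < 8; ++i)
    LaneMask[i] = PermMask[i] = -1;
  for (int i = 0; i < 8; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int Src = M / NumEltsPerLane, Dst = i / NumEltsPerLane;
    if (SrcLane[Dst] >= 0 && SrcLane[Dst] != Src)
      return false; // this destination lane would need two source lanes
    SrcLane[Dst] = Src;
    PermMask[i] = Dst * NumEltsPerLane + M % NumEltsPerLane;
  }
  for (int Dst = 0; Dst < 2; ++Dst)
    if (SrcLane[Dst] >= 0)
      for (int j = 0; j < NumEltsPerLane; ++j)
        LaneMask[Dst * NumEltsPerLane + j] = SrcLane[Dst] * NumEltsPerLane + j;
  return true;
}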
15201
15202/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15203/// source with a lane permutation.
15204///
15205/// This lowering strategy results in four instructions in the worst case for a
15206 /// single-input cross-lane shuffle, which is fewer than any other fully general
15207/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15208/// shuffle pattern should be handled prior to trying this lowering.
15209static SDValue lowerShuffleAsLanePermuteAndShuffle(
15210 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15211 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15212 // FIXME: This should probably be generalized for 512-bit vectors as well.
15213 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15214 int Size = Mask.size();
15215 int LaneSize = Size / 2;
15216
15217 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15218 // Only do this if the elements aren't all from the lower lane,
15219 // otherwise we're (probably) better off doing a split.
15220 if (VT == MVT::v4f64 &&
15221 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15222 if (SDValue V =
15223 lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
15224 return V;
15225
15226 // If there are only inputs from one 128-bit lane, splitting will in fact be
15227 // less expensive. The flags track whether the given lane contains an element
15228 // that crosses to another lane.
15229 if (!Subtarget.hasAVX2()) {
15230 bool LaneCrossing[2] = {false, false};
15231 for (int i = 0; i < Size; ++i)
15232 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15233 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15234 if (!LaneCrossing[0] || !LaneCrossing[1])
15235 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15236 } else {
15237 bool LaneUsed[2] = {false, false};
15238 for (int i = 0; i < Size; ++i)
15239 if (Mask[i] >= 0)
15240 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15241 if (!LaneUsed[0] || !LaneUsed[1])
15242 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
15243 }
15244
15245 // TODO - we could support shuffling V2 in the Flipped input.
15246 assert(V2.isUndef() &&
15247        "This last part of this routine only works on single input shuffles");
15248
15249 SmallVector<int, 32> InLaneMask(Mask.begin(), Mask.end());
15250 for (int i = 0; i < Size; ++i) {
15251 int &M = InLaneMask[i];
15252 if (M < 0)
15253 continue;
15254 if (((M % Size) / LaneSize) != (i / LaneSize))
15255 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15256 }
15257 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15258        "In-lane shuffle mask expected");
15259
15260 // Flip the lanes, and shuffle the results which should now be in-lane.
15261 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15262 SDValue Flipped = DAG.getBitcast(PVT, V1);
15263 Flipped =
15264 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15265 Flipped = DAG.getBitcast(VT, Flipped);
15266 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15267}
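
// Illustrative sketch (not part of the original file): the in-lane mask
// rewrite above. Entries that cross a 128-bit lane are redirected to the
// second shuffle operand (the lane-flipped vector), i.e. offset by Size,
// while keeping their position within the lane. Names are assumptions made
// for this example.
static void sketchRedirectCrossLaneEntries(int *Mask, int Size) {
  const int LaneSize = Size / 2; // two 128-bit lanes
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      Mask[i] = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
}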
15268
15269/// Handle lowering 2-lane 128-bit shuffles.
15270static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15271 SDValue V2, ArrayRef<int> Mask,
15272 const APInt &Zeroable,
15273 const X86Subtarget &Subtarget,
15274 SelectionDAG &DAG) {
15275 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15276 if (Subtarget.hasAVX2() && V2.isUndef())
15277 return SDValue();
15278
15279 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15280
15281 SmallVector<int, 4> WidenedMask;
15282 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15283 return SDValue();
15284
15285 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15286 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15287
15288 // Try to use an insert into a zero vector.
15289 if (WidenedMask[0] == 0 && IsHighZero) {
15290 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15291 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15292 DAG.getIntPtrConstant(0, DL));
15293 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15294 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15295 DAG.getIntPtrConstant(0, DL));
15296 }
15297
15298 // TODO: If minimizing size and one of the inputs is a zero vector and the
15299 // zero vector has only one use, we could use a VPERM2X128 to save the
15300 // instruction bytes needed to explicitly generate the zero vector.
15301
15302 // Blends are faster and handle all the non-lane-crossing cases.
15303 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15304 Subtarget, DAG))
15305 return Blend;
15306
15307 // If either input operand is a zero vector, use VPERM2X128 because its mask
15308 // allows us to replace the zero input with an implicit zero.
15309 if (!IsLowZero && !IsHighZero) {
15310 // Check for patterns which can be matched with a single insert of a 128-bit
15311 // subvector.
15312 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
15313 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
15314
15315 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15316 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15317 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15318 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15319 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
15320 OnlyUsesV1 ? V1 : V2,
15321 DAG.getIntPtrConstant(0, DL));
15322 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15323 DAG.getIntPtrConstant(2, DL));
15324 }
15325 }
15326
15327 // Try to use SHUF128 if possible.
15328 if (Subtarget.hasVLX()) {
15329 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15330 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15331 ((WidenedMask[1] % 2) << 1);
15332 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15333 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15334 }
15335 }
15336 }
15337
15338 // Otherwise form a 128-bit permutation. After accounting for undefs,
15339 // convert the 64-bit shuffle mask selection values into 128-bit
15340 // selection bits by dividing the indexes by 2 and shifting into positions
15341 // defined by a vperm2*128 instruction's immediate control byte.
15342
15343 // The immediate permute control byte looks like this:
15344 // [1:0] - select 128 bits from sources for low half of destination
15345 // [2] - ignore
15346 // [3] - zero low half of destination
15347 // [5:4] - select 128 bits from sources for high half of destination
15348 // [6] - ignore
15349 // [7] - zero high half of destination
15350
15351 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15352        (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15353
15354 unsigned PermMask = 0;
15355 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15356 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15357
15358 // Check the immediate mask and replace unused sources with undef.
15359 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15360 V1 = DAG.getUNDEF(VT);
15361 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15362 V2 = DAG.getUNDEF(VT);
15363
15364 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15365 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15366}
15367
15368/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15369/// shuffling each lane.
15370///
15371/// This attempts to create a repeated lane shuffle where each lane uses one
15372/// or two of the lanes of the inputs. The lanes of the input vectors are
15373/// shuffled in one or two independent shuffles to get the lanes into the
15374/// position needed by the final shuffle.
15375static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15376 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15377 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15378 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15379
15380 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15381 return SDValue();
15382
15383 int NumElts = Mask.size();
15384 int NumLanes = VT.getSizeInBits() / 128;
15385 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15386 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15387 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15388
15389 // First pass will try to fill in the RepeatMask from lanes that need two
15390 // sources.
15391 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15392 int Srcs[2] = {-1, -1};
15393 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15394 for (int i = 0; i != NumLaneElts; ++i) {
15395 int M = Mask[(Lane * NumLaneElts) + i];
15396 if (M < 0)
15397 continue;
15398 // Determine which of the possible input lanes (NumLanes from each source)
15399 // this element comes from. Assign that as one of the sources for this
15400 // lane. We can assign up to 2 sources for this lane. If we run out of
15401 // sources we can't do anything.
15402 int LaneSrc = M / NumLaneElts;
15403 int Src;
15404 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15405 Src = 0;
15406 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15407 Src = 1;
15408 else
15409 return SDValue();
15410
15411 Srcs[Src] = LaneSrc;
15412 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15413 }
15414
15415 // If this lane has two sources, see if it fits with the repeat mask so far.
15416 if (Srcs[1] < 0)
15417 continue;
15418
15419 LaneSrcs[Lane][0] = Srcs[0];
15420 LaneSrcs[Lane][1] = Srcs[1];
15421
15422 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15423 assert(M1.size() == M2.size() && "Unexpected mask size");
15424 for (int i = 0, e = M1.size(); i != e; ++i)
15425 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15426 return false;
15427 return true;
15428 };
15429
15430 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15431 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15432 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15433 int M = Mask[i];
15434 if (M < 0)
15435 continue;
15436 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15437 "Unexpected mask element");
15438 MergedMask[i] = M;
15439 }
15440 };
15441
15442 if (MatchMasks(InLaneMask, RepeatMask)) {
15443 // Merge this lane mask into the final repeat mask.
15444 MergeMasks(InLaneMask, RepeatMask);
15445 continue;
15446 }
15447
15448 // Didn't find a match. Swap the operands and try again.
15449 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15450 ShuffleVectorSDNode::commuteMask(InLaneMask);
15451
15452 if (MatchMasks(InLaneMask, RepeatMask)) {
15453 // Merge this lane mask into the final repeat mask.
15454 MergeMasks(InLaneMask, RepeatMask);
15455 continue;
15456 }
15457
15458 // Couldn't find a match with the operands in either order.
15459 return SDValue();
15460 }
15461
15462 // Now handle any lanes with only one source.
15463 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15464 // If this lane has already been processed, skip it.
15465 if (LaneSrcs[Lane][0] >= 0)
15466 continue;
15467
15468 for (int i = 0; i != NumLaneElts; ++i) {
15469 int M = Mask[(Lane * NumLaneElts) + i];
15470 if (M < 0)
15471 continue;
15472
15473 // If RepeatMask isn't defined yet we can define it ourselves.
15474 if (RepeatMask[i] < 0)
15475 RepeatMask[i] = M % NumLaneElts;
15476
15477 if (RepeatMask[i] < NumElts) {
15478 if (RepeatMask[i] != M % NumLaneElts)
15479 return SDValue();
15480 LaneSrcs[Lane][0] = M / NumLaneElts;
15481 } else {
15482 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15483 return SDValue();
15484 LaneSrcs[Lane][1] = M / NumLaneElts;
15485 }
15486 }
15487
15488 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15489 return SDValue();
15490 }
15491
15492 SmallVector<int, 16> NewMask(NumElts, -1);
15493 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15494 int Src = LaneSrcs[Lane][0];
15495 for (int i = 0; i != NumLaneElts; ++i) {
15496 int M = -1;
15497 if (Src >= 0)
15498 M = Src * NumLaneElts + i;
15499 NewMask[Lane * NumLaneElts + i] = M;
15500 }
15501 }
15502 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15503 // Ensure we didn't get back the shuffle we started with.
15504 // FIXME: This is a hack to make up for some splat handling code in
15505 // getVectorShuffle.
15506 if (isa<ShuffleVectorSDNode>(NewV1) &&
15507 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15508 return SDValue();
15509
15510 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15511 int Src = LaneSrcs[Lane][1];
15512 for (int i = 0; i != NumLaneElts; ++i) {
15513 int M = -1;
15514 if (Src >= 0)
15515 M = Src * NumLaneElts + i;
15516 NewMask[Lane * NumLaneElts + i] = M;
15517 }
15518 }
15519 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15520 // Ensure we didn't get back the shuffle we started with.
15521 // FIXME: This is a hack to make up for some splat handling code in
15522 // getVectorShuffle.
15523 if (isa<ShuffleVectorSDNode>(NewV2) &&
15524 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15525 return SDValue();
15526
15527 for (int i = 0; i != NumElts; ++i) {
15528 NewMask[i] = RepeatMask[i % NumLaneElts];
15529 if (NewMask[i] < 0)
15530 continue;
15531
15532 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15533 }
15534 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15535}
15536
15537/// If the input shuffle mask results in a vector that is undefined in all upper
15538/// or lower half elements and that mask accesses only 2 halves of the
15539/// shuffle's operands, return true. A mask of half the width with mask indexes
15540/// adjusted to access the extracted halves of the original shuffle operands is
15541/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15542/// lower half of each input operand is accessed.
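/// As an illustrative example: for an 8-element mask <u,u,u,u,0,1,12,13> the
/// lower half of the result is undef, the upper half reads the lower half of
/// V1 (HalfIdx1 = 0) and the upper half of V2 (HalfIdx2 = 3), and the
/// returned HalfMask is {0, 1, 4, 5}.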
15543static bool
15544getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15545 int &HalfIdx1, int &HalfIdx2) {
15546 assert((Mask.size() == HalfMask.size() * 2) &&
15547 "Expected input mask to be twice as long as output");
15548
15549 // Exactly one half of the result must be undef to allow narrowing.
15550 bool UndefLower = isUndefLowerHalf(Mask);
15551 bool UndefUpper = isUndefUpperHalf(Mask);
15552 if (UndefLower == UndefUpper)
15553 return false;
15554
15555 unsigned HalfNumElts = HalfMask.size();
15556 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15557 HalfIdx1 = -1;
15558 HalfIdx2 = -1;
15559 for (unsigned i = 0; i != HalfNumElts; ++i) {
15560 int M = Mask[i + MaskIndexOffset];
15561 if (M < 0) {
15562 HalfMask[i] = M;
15563 continue;
15564 }
15565
15566 // Determine which of the 4 half vectors this element is from.
15567 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15568 int HalfIdx = M / HalfNumElts;
15569
15570 // Determine the element index into its half vector source.
15571 int HalfElt = M % HalfNumElts;
15572
15573 // We can shuffle with up to 2 half vectors, set the new 'half'
15574 // shuffle mask accordingly.
15575 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15576 HalfMask[i] = HalfElt;
15577 HalfIdx1 = HalfIdx;
15578 continue;
15579 }
15580 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15581 HalfMask[i] = HalfElt + HalfNumElts;
15582 HalfIdx2 = HalfIdx;
15583 continue;
15584 }
15585
15586 // Too many half vectors referenced.
15587 return false;
15588 }
15589
15590 return true;
15591}
15592
15593/// Given the output values from getHalfShuffleMask(), create a half width
15594/// shuffle of extracted vectors followed by an insert back to full width.
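/// Continuing the illustrative example above (HalfIdx1 = 0, HalfIdx2 = 3,
/// HalfMask = {0, 1, 4, 5}, UndefLower = true): the low half of V1 and the
/// high half of V2 are extracted, shuffled with {0, 1, 4, 5}, and the result
/// is inserted into the upper half of an otherwise undef full-width vector.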
15595static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15596 ArrayRef<int> HalfMask, int HalfIdx1,
15597 int HalfIdx2, bool UndefLower,
15598 SelectionDAG &DAG, bool UseConcat = false) {
15599 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15600 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15601
15602 MVT VT = V1.getSimpleValueType();
15603 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15604 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15605
15606 auto getHalfVector = [&](int HalfIdx) {
15607 if (HalfIdx < 0)
15608 return DAG.getUNDEF(HalfVT);
15609 SDValue V = (HalfIdx < 2 ? V1 : V2);
15610 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15611 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15612 DAG.getIntPtrConstant(HalfIdx, DL));
15613 };
15614
15615 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15616 SDValue Half1 = getHalfVector(HalfIdx1);
15617 SDValue Half2 = getHalfVector(HalfIdx2);
15618 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15619 if (UseConcat) {
15620 SDValue Op0 = V;
15621 SDValue Op1 = DAG.getUNDEF(HalfVT);
15622 if (UndefLower)
15623 std::swap(Op0, Op1);
15624 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15625 }
15626
15627 unsigned Offset = UndefLower ? HalfNumElts : 0;
15628 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15629 DAG.getIntPtrConstant(Offset, DL));
15630}
15631
15632/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15633/// This allows for fast cases such as subvector extraction/insertion
15634/// or shuffling smaller vector types which can lower more efficiently.
15635static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15636 SDValue V2, ArrayRef<int> Mask,
15637 const X86Subtarget &Subtarget,
15638 SelectionDAG &DAG) {
15639 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15640 "Expected 256-bit or 512-bit vector");
15641
15642 bool UndefLower = isUndefLowerHalf(Mask);
15643 if (!UndefLower && !isUndefUpperHalf(Mask))
15644 return SDValue();
15645
15646 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15647 "Completely undef shuffle mask should have been simplified already");
15648
15649 // Upper half is undef and lower half is whole upper subvector.
15650 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15651 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15652 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15653 if (!UndefLower &&
15654 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15655 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15656 DAG.getIntPtrConstant(HalfNumElts, DL));
15657 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15658 DAG.getIntPtrConstant(0, DL));
15659 }
15660
15661 // Lower half is undef and upper half is whole lower subvector.
15662 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15663 if (UndefLower &&
15664 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15665 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15666 DAG.getIntPtrConstant(0, DL));
15667 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15668 DAG.getIntPtrConstant(HalfNumElts, DL));
15669 }
15670
15671 int HalfIdx1, HalfIdx2;
15672 SmallVector<int, 8> HalfMask(HalfNumElts);
15673 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15674 return SDValue();
15675
15676 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15677
15678 // Only shuffle the halves of the inputs when useful.
15679 unsigned NumLowerHalves =
15680 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15681 unsigned NumUpperHalves =
15682 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15683 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15684
15685 // Determine the larger pattern of undef/halves, then decide if it's worth
15686 // splitting the shuffle based on subtarget capabilities and types.
15687 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15688 if (!UndefLower) {
15689 // XXXXuuuu: no insert is needed.
15690 // Always extract lowers when setting lower - these are all free subreg ops.
15691 if (NumUpperHalves == 0)
15692 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15693 UndefLower, DAG);
15694
15695 if (NumUpperHalves == 1) {
15696 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15697 if (Subtarget.hasAVX2()) {
15698 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15699 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15700 !is128BitUnpackShuffleMask(HalfMask) &&
15701 (!isSingleSHUFPSMask(HalfMask) ||
15702 Subtarget.hasFastVariableShuffle()))
15703 return SDValue();
15704 // If this is a unary shuffle (assume that the 2nd operand is
15705 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15706 // are better off extracting the upper half of 1 operand and using a
15707 // narrow shuffle.
15708 if (EltWidth == 64 && V2.isUndef())
15709 return SDValue();
15710 }
15711 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15712 if (Subtarget.hasAVX512() && VT.is512BitVector())
15713 return SDValue();
15714 // Extract + narrow shuffle is better than the wide alternative.
15715 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15716 UndefLower, DAG);
15717 }
15718
15719 // Don't extract both uppers, instead shuffle and then extract.
15720 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15721 return SDValue();
15722 }
15723
15724 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15725 if (NumUpperHalves == 0) {
15726 // AVX2 has efficient 64-bit element cross-lane shuffles.
15727 // TODO: Refine to account for unary shuffle, splat, and other masks?
15728 if (Subtarget.hasAVX2() && EltWidth == 64)
15729 return SDValue();
15730 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15731 if (Subtarget.hasAVX512() && VT.is512BitVector())
15732 return SDValue();
15733 // Narrow shuffle + insert is better than the wide alternative.
15734 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15735 UndefLower, DAG);
15736 }
15737
15738 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15739 return SDValue();
15740}
15741
15742/// Test whether the specified input (0 or 1) is in-place blended by the
15743/// given mask.
15744///
15745/// This returns true if the elements from a particular input are already in the
15746/// slot required by the given mask and require no permutation.
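/// For example (values chosen for illustration), a 4-element mask <0, 1, 6, 7>
/// leaves both inputs in place: V1 already supplies elements 0-1 and V2
/// already supplies elements 2-3 in their final slots.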
15747static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
15748 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
15749 int Size = Mask.size();
15750 for (int i = 0; i < Size; ++i)
15751 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
15752 return false;
15753
15754 return true;
15755}
15756
15757/// Handle case where shuffle sources are coming from the same 128-bit lane and
15758/// every lane can be represented as the same repeating mask - allowing us to
15759/// shuffle the sources with the repeating shuffle and then permute the result
15760/// to the destination lanes.
15761static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15762 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15763 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15764 int NumElts = VT.getVectorNumElements();
15765 int NumLanes = VT.getSizeInBits() / 128;
15766 int NumLaneElts = NumElts / NumLanes;
15767
15768 // On AVX2 we may be able to just shuffle the lowest elements and then
15769 // broadcast the result.
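// As an illustrative example: for a v8i32 mask <3,2,3,2,3,2,3,2> and a 64-bit
// broadcast size, the repeating pair {3, 2} is first shuffled into the lowest
// 64 bits and that pair is then broadcast with the mask <0,1,0,1,0,1,0,1>.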
15770 if (Subtarget.hasAVX2()) {
15771 for (unsigned BroadcastSize : {16, 32, 64}) {
15772 if (BroadcastSize <= VT.getScalarSizeInBits())
15773 continue;
15774 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15775
15776 // Attempt to match a repeating pattern every NumBroadcastElts,
15777 // accounting for UNDEFs, but only referencing the lowest 128-bit
15778 // lane of the inputs.
15779 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15780 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15781 for (int j = 0; j != NumBroadcastElts; ++j) {
15782 int M = Mask[i + j];
15783 if (M < 0)
15784 continue;
15785 int &R = RepeatMask[j];
15786 if (0 != ((M % NumElts) / NumLaneElts))
15787 return false;
15788 if (0 <= R && R != M)
15789 return false;
15790 R = M;
15791 }
15792 return true;
15793 };
15794
15795 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15796 if (!FindRepeatingBroadcastMask(RepeatMask))
15797 continue;
15798
15799 // Shuffle the (lowest) repeated elements in place for broadcast.
15800 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15801
15802 // Shuffle the actual broadcast.
15803 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15804 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15805 for (int j = 0; j != NumBroadcastElts; ++j)
15806 BroadcastMask[i + j] = j;
15807 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15808 BroadcastMask);
15809 }
15810 }
15811
15812 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15813 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15814 return SDValue();
15815
15816 // Bail if we already have a repeated lane shuffle mask.
15817 SmallVector<int, 8> RepeatedShuffleMask;
15818 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
15819 return SDValue();
15820
15821 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15822 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
15823 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
15824 int NumSubLanes = NumLanes * SubLaneScale;
15825 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15826
15827 // Check that all the sources are coming from the same lane and see if we can
15828 // form a repeating shuffle mask (local to each sub-lane). At the same time,
15829 // determine the source sub-lane for each destination sub-lane.
15830 int TopSrcSubLane = -1;
15831 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15832 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
15833 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
15834 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
15835
15836 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15837 // Extract the sub-lane mask, check that it all comes from the same lane
15838 // and normalize the mask entries to come from the first lane.
15839 int SrcLane = -1;
15840 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15841 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15842 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15843 if (M < 0)
15844 continue;
15845 int Lane = (M % NumElts) / NumLaneElts;
15846 if ((0 <= SrcLane) && (SrcLane != Lane))
15847 return SDValue();
15848 SrcLane = Lane;
15849 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15850 SubLaneMask[Elt] = LocalM;
15851 }
15852
15853 // Whole sub-lane is UNDEF.
15854 if (SrcLane < 0)
15855 continue;
15856
15857 // Attempt to match against the candidate repeated sub-lane masks.
15858 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15859 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15860 for (int i = 0; i != NumSubLaneElts; ++i) {
15861 if (M1[i] < 0 || M2[i] < 0)
15862 continue;
15863 if (M1[i] != M2[i])
15864 return false;
15865 }
15866 return true;
15867 };
15868
15869 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15870 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15871 continue;
15872
15873 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15874 for (int i = 0; i != NumSubLaneElts; ++i) {
15875 int M = SubLaneMask[i];
15876 if (M < 0)
15877 continue;
15878 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15879 "Unexpected mask element");
15880 RepeatedSubLaneMask[i] = M;
15881 }
15882
15883 // Track the topmost source sub-lane - by setting the remaining to UNDEF
15884 // we can greatly simplify shuffle matching.
15885 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15886 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15887 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15888 break;
15889 }
15890
15891 // Bail if we failed to find a matching repeated sub-lane mask.
15892 if (Dst2SrcSubLanes[DstSubLane] < 0)
15893 return SDValue();
15894 }
15895 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15896 "Unexpected source lane");
15897
15898 // Create a repeating shuffle mask for the entire vector.
15899 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15900 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15901 int Lane = SubLane / SubLaneScale;
15902 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15903 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15904 int M = RepeatedSubLaneMask[Elt];
15905 if (M < 0)
15906 continue;
15907 int Idx = (SubLane * NumSubLaneElts) + Elt;
15908 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15909 }
15910 }
15911 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15912
15913 // Shuffle each source sub-lane to its destination.
15914 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15915 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15916 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15917 if (SrcSubLane < 0)
15918 continue;
15919 for (int j = 0; j != NumSubLaneElts; ++j)
15920 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15921 }
15922
15923 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15924 SubLaneMask);
15925}
15926
15927static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15928 bool &ForceV1Zero, bool &ForceV2Zero,
15929 unsigned &ShuffleImm, ArrayRef<int> Mask,
15930 const APInt &Zeroable) {
15931 int NumElts = VT.getVectorNumElements();
15932 assert(VT.getScalarSizeInBits() == 64 &&
15933 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15934 "Unexpected data type for VSHUFPD");
15935 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15936 "Illegal shuffle mask");
15937
15938 bool ZeroLane[2] = { true, true };
15939 for (int i = 0; i < NumElts; ++i)
15940 ZeroLane[i & 1] &= Zeroable[i];
15941
15942 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15943 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
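// As an illustrative example for v4f64: the mask <1, 5, 2, 7> fits this
// pattern and produces ShuffleImm = (1 << 0) | (1 << 1) | (0 << 2) | (1 << 3)
// = 0xB with no commuting required.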
15944 ShuffleImm = 0;
15945 bool ShufpdMask = true;
15946 bool CommutableMask = true;
15947 for (int i = 0; i < NumElts; ++i) {
15948 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15949 continue;
15950 if (Mask[i] < 0)
15951 return false;
15952 int Val = (i & 6) + NumElts * (i & 1);
15953 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15954 if (Mask[i] < Val || Mask[i] > Val + 1)
15955 ShufpdMask = false;
15956 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15957 CommutableMask = false;
15958 ShuffleImm |= (Mask[i] % 2) << i;
15959 }
15960
15961 if (!ShufpdMask && !CommutableMask)
15962 return false;
15963
15964 if (!ShufpdMask && CommutableMask)
15965 std::swap(V1, V2);
15966
15967 ForceV1Zero = ZeroLane[0];
15968 ForceV2Zero = ZeroLane[1];
15969 return true;
15970}
15971
15972static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15973 SDValue V2, ArrayRef<int> Mask,
15974 const APInt &Zeroable,
15975 const X86Subtarget &Subtarget,
15976 SelectionDAG &DAG) {
15977 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15978 "Unexpected data type for VSHUFPD");
15979
15980 unsigned Immediate = 0;
15981 bool ForceV1Zero = false, ForceV2Zero = false;
15982 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15983 Mask, Zeroable))
15984 return SDValue();
15985
15986 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15987 if (ForceV1Zero)
15988 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15989 if (ForceV2Zero)
15990 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15991
15992 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15993 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15994}
15995
15996 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15997 // by zeroable elements in the remaining 24 elements. Turn this into two
15998 // vmovqb instructions shuffled together.
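// An illustrative walk-through: after bitcasting each input to v4i64 and
// truncating with VTRUNC to v16i8, V1 and V2 each hold bytes {0, 8, 16, 24}
// of the original vector in their low 4 bytes with zeros above; interleaving
// those 4-byte groups reproduces the requested first 8 elements while the
// zeroed tails supply the remaining zeros.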
15999static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16000 SDValue V1, SDValue V2,
16001 ArrayRef<int> Mask,
16002 const APInt &Zeroable,
16003 SelectionDAG &DAG) {
16004 assert(VT == MVT::v32i8 && "Unexpected type!");
16005
16006 // The first 8 indices should be every 8th element.
16007 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16008 return SDValue();
16009
16010 // Remaining elements need to be zeroable.
16011 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
16012 return SDValue();
16013
16014 V1 = DAG.getBitcast(MVT::v4i64, V1);
16015 V2 = DAG.getBitcast(MVT::v4i64, V2);
16016
16017 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16018 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16019
16020 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16021 // the upper bits of the result using an unpckldq.
16022 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16023 { 0, 1, 2, 3, 16, 17, 18, 19,
16024 4, 5, 6, 7, 20, 21, 22, 23 });
16025 // Insert the unpckldq into a zero vector to widen to v32i8.
16026 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16027 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16028 DAG.getIntPtrConstant(0, DL));
16029}
16030
16031
16032/// Handle lowering of 4-lane 64-bit floating point shuffles.
16033///
16034/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16035/// isn't available.
16036static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16037 const APInt &Zeroable, SDValue V1, SDValue V2,
16038 const X86Subtarget &Subtarget,
16039 SelectionDAG &DAG) {
16040 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16041 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16042 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16043
16044 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16045 Subtarget, DAG))
16046 return V;
16047
16048 if (V2.isUndef()) {
16049 // Check for being able to broadcast a single element.
16050 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16051 Mask, Subtarget, DAG))
16052 return Broadcast;
16053
16054 // Use low duplicate instructions for masks that match their pattern.
16055 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
16056 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16057
16058 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16059 // Non-half-crossing single input shuffles can be lowered with an
16060 // interleaved permutation.
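// As an illustrative example, the in-lane swap mask <1, 0, 3, 2> yields
// VPERMILPMask = 0b0101 = 5 (swap the two doubles within each 128-bit lane).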
16061 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16062 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16063 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16064 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16065 }
16066
16067 // With AVX2 we have direct support for this permutation.
16068 if (Subtarget.hasAVX2())
16069 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16070 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16071
16072 // Try to create an in-lane repeating shuffle mask and then shuffle the
16073 // results into the target lanes.
16074 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16075 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16076 return V;
16077
16078 // Try to permute the lanes and then use a per-lane permute.
16079 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16080 Mask, DAG, Subtarget))
16081 return V;
16082
16083 // Otherwise, fall back.
16084 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16085 DAG, Subtarget);
16086 }
16087
16088 // Use dedicated unpack instructions for masks that match their pattern.
16089 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
16090 return V;
16091
16092 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16093 Zeroable, Subtarget, DAG))
16094 return Blend;
16095
16096 // Check if the blend happens to exactly fit that of SHUFPD.
16097 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16098 Zeroable, Subtarget, DAG))
16099 return Op;
16100
16101 // If we have lane crossing shuffles AND they don't all come from the lower
16102 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16103 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16104 // canonicalizes to a blend of splats, which isn't necessary for this combine.
16105 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16106 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16107 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16108 (V2.getOpcode() != ISD::BUILD_VECTOR))
16109 if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
16110 Mask, DAG))
16111 return Op;
16112
16113 // If we have one input in place, then we can permute the other input and
16114 // blend the result.
16115 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16116 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
16117 Subtarget, DAG);
16118
16119 // Try to create an in-lane repeating shuffle mask and then shuffle the
16120 // results into the target lanes.
16121 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16122 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16123 return V;
16124
16125 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16126 // shuffle. However, if we have AVX2 and either input is already in place,
16127 // we will be able to shuffle the other input even across lanes in a single
16128 // instruction, so skip this pattern.
16129 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
16130 isShuffleMaskInputInPlace(1, Mask))))
16131 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16132 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16133 return V;
16134
16135 // If we have VLX support, we can use VEXPAND.
16136 if (Subtarget.hasVLX())
16137 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
16138 DAG, Subtarget))
16139 return V;
16140
16141 // If we have AVX2 then we always want to lower with a blend because at v4 we
16142 // can fully permute the elements.
16143 if (Subtarget.hasAVX2())
16144 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask,
16145 Subtarget, DAG);
16146
16147 // Otherwise fall back on generic lowering.
16148 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
16149 Subtarget, DAG);
16150}
16151
16152/// Handle lowering of 4-lane 64-bit integer shuffles.
16153///
16154/// This routine is only called when we have AVX2 and thus a reasonable
16155 /// instruction set for v4i64 shuffling.
16156static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16157 const APInt &Zeroable, SDValue V1, SDValue V2,
16158 const X86Subtarget &Subtarget,
16159 SelectionDAG &DAG) {
16160 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16161 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16162 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16163 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16164
16165 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16166 Subtarget, DAG))
16167 return V;
16168
16169 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16170 Zeroable, Subtarget, DAG))
16171 return Blend;
16172
16173 // Check for being able to broadcast a single element.
16174 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16175 Subtarget, DAG))
16176 return Broadcast;
16177
16178 if (V2.isUndef()) {
16179 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16180 // can use lower latency instructions that will operate on both lanes.
16181 SmallVector<int, 2> RepeatedMask;
16182 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16183 SmallVector<int, 4> PSHUFDMask;
16184 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
16185 return DAG.getBitcast(
16186 MVT::v4i64,
16187 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16188 DAG.getBitcast(MVT::v8i32, V1),
16189 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16190 }
16191
16192 // AVX2 provides a direct instruction for permuting a single input across
16193 // lanes.
16194 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16195 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16196 }
16197
16198 // Try to use shift instructions.
16199 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
16200 Zeroable, Subtarget, DAG))
16201 return Shift;
16202
16203 // If we have VLX support, we can use VALIGN or VEXPAND.
16204 if (Subtarget.hasVLX()) {
16205 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16206 Subtarget, DAG))
16207 return Rotate;
16208
16209 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16210 DAG, Subtarget))
16211 return V;
16212 }
16213
16214 // Try to use PALIGNR.
16215 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16216 Subtarget, DAG))
16217 return Rotate;
16218
16219 // Use dedicated unpack instructions for masks that match their pattern.
16220 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16221 return V;
16222
16223 // If we have one input in place, then we can permute the other input and
16224 // blend the result.
16225 if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
16226 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
16227 Subtarget, DAG);
16228
16229 // Try to create an in-lane repeating shuffle mask and then shuffle the
16230 // results into the target lanes.
16231 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16232 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16233 return V;
16234
16235 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16236 // shuffle. However, if we have AVX2 and either input is already in place,
16237 // we will be able to shuffle the other input even across lanes in a single
16238 // instruction, so skip this pattern.
16239 if (!isShuffleMaskInputInPlace(0, Mask) &&
16240 !isShuffleMaskInputInPlace(1, Mask))
16241 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16242 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16243 return Result;
16244
16245 // Otherwise fall back on generic blend lowering.
16246 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask,
16247 Subtarget, DAG);
16248}
16249
16250/// Handle lowering of 8-lane 32-bit floating point shuffles.
16251///
16252/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16253/// isn't available.
16254static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16255 const APInt &Zeroable, SDValue V1, SDValue V2,
16256 const X86Subtarget &Subtarget,
16257 SelectionDAG &DAG) {
16258 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16259 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16260 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16261
16262 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16263 Zeroable, Subtarget, DAG))
16264 return Blend;
16265
16266 // Check for being able to broadcast a single element.
16267 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16268 Subtarget, DAG))
16269 return Broadcast;
16270
16271 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16272 // options to efficiently lower the shuffle.
16273 SmallVector<int, 4> RepeatedMask;
16274 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16275 assert(RepeatedMask.size() == 4 &&
16276 "Repeated masks must be half the mask width!");
16277
16278 // Use even/odd duplicate instructions for masks that match their pattern.
16279 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
16280 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16281 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
16282 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16283
16284 if (V2.isUndef())
16285 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16286 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16287
16288 // Use dedicated unpack instructions for masks that match their pattern.
16289 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16290 return V;
16291
16292 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16293 // have already handled any direct blends.
16294 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16295 }
16296
16297 // Try to create an in-lane repeating shuffle mask and then shuffle the
16298 // results into the target lanes.
16299 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16300 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16301 return V;
16302
16303 // If we have a single input shuffle with different shuffle patterns in the
16304 // two 128-bit lanes use the variable mask to VPERMILPS.
16305 if (V2.isUndef()) {
16306 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16307 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16308 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16309 }
16310 if (Subtarget.hasAVX2()) {
16311 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16312 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16313 }
16314 // Otherwise, fall back.
16315 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16316 DAG, Subtarget);
16317 }
16318
16319 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16320 // shuffle.
16321 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16322 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16323 return Result;
16324
16325 // If we have VLX support, we can use VEXPAND.
16326 if (Subtarget.hasVLX())
16327 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16328 DAG, Subtarget))
16329 return V;
16330
16331 // For non-AVX512, if the Mask is of 16-bit elements in each lane then try to
16332 // split, since after the split we get more efficient code using vpunpcklwd
16333 // and vpunpckhwd instrs than vblend.
16334 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
16335 if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16336 Subtarget, DAG))
16337 return V;
16338
16339 // If we have AVX2 then we always want to lower with a blend because at v8 we
16340 // can fully permute the elements.
16341 if (Subtarget.hasAVX2())
16342 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask,
16343 Subtarget, DAG);
16344
16345 // Otherwise fall back on generic lowering.
16346 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16347 Subtarget, DAG);
16348}
16349
16350/// Handle lowering of 8-lane 32-bit integer shuffles.
16351///
16352/// This routine is only called when we have AVX2 and thus a reasonable
16353 /// instruction set for v8i32 shuffling.
16354static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16355 const APInt &Zeroable, SDValue V1, SDValue V2,
16356 const X86Subtarget &Subtarget,
16357 SelectionDAG &DAG) {
16358 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16359 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16360 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16361 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16362
16363 // Whenever we can lower this as a zext, that instruction is strictly faster
16364 // than any alternative. It also allows us to fold memory operands into the
16365 // shuffle in many cases.
16366 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16367 Zeroable, Subtarget, DAG))
16368 return ZExt;
16369
16370 // For non-AVX512, if the mask is an in-lane 16-bit unpack pattern, try to
16371 // split, since after the split we get more efficient code than vblend by
16372 // using the vpunpcklwd and vpunpckhwd instructions.
16373 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
16374 !Subtarget.hasAVX512())
16375 if (SDValue V = lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask,
16376 Subtarget, DAG))
16377 return V;
16378
16379 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16380 Zeroable, Subtarget, DAG))
16381 return Blend;
16382
16383 // Check for being able to broadcast a single element.
16384 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16385 Subtarget, DAG))
16386 return Broadcast;
16387
16388 // If the shuffle mask is repeated in each 128-bit lane we can use more
16389 // efficient instructions that mirror the shuffles across the two 128-bit
16390 // lanes.
16391 SmallVector<int, 4> RepeatedMask;
16392 bool Is128BitLaneRepeatedShuffle =
16393 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16394 if (Is128BitLaneRepeatedShuffle) {
16395 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16396 if (V2.isUndef())
16397 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16398 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16399
16400 // Use dedicated unpack instructions for masks that match their pattern.
16401 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16402 return V;
16403 }
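  // [Editor's note: illustrative example, not part of the original source.]
  // getV4X86ShuffleImm8ForMask packs a 4-element repeated mask into the
  // 2-bits-per-element immediate used by PSHUFD/SHUFPS. For instance, a
  // repeated mask of {1, 0, 3, 2} would encode as
  //   imm8 = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xB1,
  // i.e. swap adjacent 32-bit elements within each 128-bit lane.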
16404
16405 // Try to use shift instructions.
16406 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
16407 Zeroable, Subtarget, DAG))
16408 return Shift;
16409
16410 // If we have VLX support, we can use VALIGN or EXPAND.
16411 if (Subtarget.hasVLX()) {
16412 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16413 Subtarget, DAG))
16414 return Rotate;
16415
16416 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16417 DAG, Subtarget))
16418 return V;
16419 }
16420
16421 // Try to use byte rotation instructions.
16422 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16423 Subtarget, DAG))
16424 return Rotate;
16425
16426 // Try to create an in-lane repeating shuffle mask and then shuffle the
16427 // results into the target lanes.
16428 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16429 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16430 return V;
16431
16432 if (V2.isUndef()) {
16433 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16434 // because that should be faster than the variable permute alternatives.
16435 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16436 return V;
16437
16438 // If the shuffle patterns aren't repeated but it's a single input, directly
16439 // generate a cross-lane VPERMD instruction.
16440 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16441 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16442 }
16443
16444 // Assume that a single SHUFPS is faster than an alternative sequence of
16445 // multiple instructions (even if the CPU has a domain penalty).
16446 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16447 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16448 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16449 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16450 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16451 CastV1, CastV2, DAG);
16452 return DAG.getBitcast(MVT::v8i32, ShufPS);
16453 }
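  // [Editor's note: illustrative sketch, assuming the standard SHUFPS
  // immediate encoding.] For example, a repeated mask of {0, 1, 4, 5} takes
  // the low two elements of each 128-bit lane from V1 and the low two from
  // V2, which a single "shufps $0x44" can produce per lane; the bitcasts
  // above simply route the integer data through the FP shuffle unit.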
16454
16455 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16456 // shuffle.
16457 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16458 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16459 return Result;
16460
16461 // Otherwise fall back on generic blend lowering.
16462 return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask,
16463 Subtarget, DAG);
16464}
16465
16466/// Handle lowering of 16-lane 16-bit integer shuffles.
16467///
16468/// This routine is only called when we have AVX2 and thus a reasonable
16469 /// instruction set for v16i16 shuffling.
16470static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16471 const APInt &Zeroable, SDValue V1, SDValue V2,
16472 const X86Subtarget &Subtarget,
16473 SelectionDAG &DAG) {
16474 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16475 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16476 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16477 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16478
16479 // Whenever we can lower this as a zext, that instruction is strictly faster
16480 // than any alternative. It also allows us to fold memory operands into the
16481 // shuffle in many cases.
16482 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16483 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16484 return ZExt;
16485
16486 // Check for being able to broadcast a single element.
16487 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16488 Subtarget, DAG))
16489 return Broadcast;
16490
16491 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16492 Zeroable, Subtarget, DAG))
16493 return Blend;
16494
16495 // Use dedicated unpack instructions for masks that match their pattern.
16496 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16497 return V;
16498
16499 // Use dedicated pack instructions for masks that match their pattern.
16500 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16501 Subtarget))
16502 return V;
16503
16504 // Try to use shift instructions.
16505 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
16506 Zeroable, Subtarget, DAG))
16507 return Shift;
16508
16509 // Try to use byte rotation instructions.
16510 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16511 Subtarget, DAG))
16512 return Rotate;
16513
16514 // Try to create an in-lane repeating shuffle mask and then shuffle the
16515 // results into the target lanes.
16516 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16517 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16518 return V;
16519
16520 if (V2.isUndef()) {
16521 // Try to use bit rotation instructions.
16522 if (SDValue Rotate =
16523 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16524 return Rotate;
16525
16526 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16527 // because that should be faster than the variable permute alternatives.
16528 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16529 return V;
16530
16531 // There are no generalized cross-lane shuffle operations available on i16
16532 // element types.
16533 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16534 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16535 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16536 return V;
16537
16538 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16539 DAG, Subtarget);
16540 }
16541
16542 SmallVector<int, 8> RepeatedMask;
16543 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16544 // As this is a single-input shuffle, the repeated mask should be
16545 // a strictly valid v8i16 mask that we can pass through to the v8i16
16546 // lowering to handle even the v16 case.
16547 return lowerV8I16GeneralSingleInputShuffle(
16548 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16549 }
16550 }
16551
16552 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16553 Zeroable, Subtarget, DAG))
16554 return PSHUFB;
16555
16556 // AVX512BWVL can lower to VPERMW.
16557 if (Subtarget.hasBWI() && Subtarget.hasVLX())
16558 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
16559
16560 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16561 // shuffle.
16562 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16563 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16564 return Result;
16565
16566 // Try to permute the lanes and then use a per-lane permute.
16567 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16568 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16569 return V;
16570
16571 // Otherwise fall back on generic lowering.
16572 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16573 Subtarget, DAG);
16574}
16575
16576/// Handle lowering of 32-lane 8-bit integer shuffles.
16577///
16578/// This routine is only called when we have AVX2 and thus a reasonable
16579 /// instruction set for v32i8 shuffling.
16580static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16581 const APInt &Zeroable, SDValue V1, SDValue V2,
16582 const X86Subtarget &Subtarget,
16583 SelectionDAG &DAG) {
16584 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16585 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16586 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16587 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16588
16589 // Whenever we can lower this as a zext, that instruction is strictly faster
16590 // than any alternative. It also allows us to fold memory operands into the
16591 // shuffle in many cases.
16592 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16593 Zeroable, Subtarget, DAG))
16594 return ZExt;
16595
16596 // Check for being able to broadcast a single element.
16597 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16598 Subtarget, DAG))
16599 return Broadcast;
16600
16601 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16602 Zeroable, Subtarget, DAG))
16603 return Blend;
16604
16605 // Use dedicated unpack instructions for masks that match their pattern.
16606 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16607 return V;
16608
16609 // Use dedicated pack instructions for masks that match their pattern.
16610 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16611 Subtarget))
16612 return V;
16613
16614 // Try to use shift instructions.
16615 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
16616 Zeroable, Subtarget, DAG))
16617 return Shift;
16618
16619 // Try to use byte rotation instructions.
16620 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16621 Subtarget, DAG))
16622 return Rotate;
16623
16624 // Try to use bit rotation instructions.
16625 if (V2.isUndef())
16626 if (SDValue Rotate =
16627 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16628 return Rotate;
16629
16630 // Try to create an in-lane repeating shuffle mask and then shuffle the
16631 // results into the target lanes.
16632 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16633 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16634 return V;
16635
16636 // There are no generalized cross-lane shuffle operations available on i8
16637 // element types.
16638 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16639 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16640 // because that should be faster than the variable permute alternatives.
16641 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16642 return V;
16643
16644 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16645 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16646 return V;
16647
16648 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16649 DAG, Subtarget);
16650 }
16651
16652 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16653 Zeroable, Subtarget, DAG))
16654 return PSHUFB;
16655
16656 // AVX512VBMIVL can lower to VPERMB.
16657 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
16658 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
16659
16660 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16661 // shuffle.
16662 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16663 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16664 return Result;
16665
16666 // Try to permute the lanes and then use a per-lane permute.
16667 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16668 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16669 return V;
16670
16671 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16672 // by zeroable elements in the remaining 24 elements. Turn this into two
16673 // vmovqb instructions shuffled together.
16674 if (Subtarget.hasVLX())
16675 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16676 Mask, Zeroable, DAG))
16677 return V;
16678
16679 // Otherwise fall back on generic lowering.
16680 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16681 Subtarget, DAG);
16682}
16683
16684/// High-level routine to lower various 256-bit x86 vector shuffles.
16685///
16686/// This routine either breaks down the specific type of a 256-bit x86 vector
16687/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16688/// together based on the available instructions.
16689static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16690 SDValue V1, SDValue V2, const APInt &Zeroable,
16691 const X86Subtarget &Subtarget,
16692 SelectionDAG &DAG) {
16693 // If we have a single input to the zero element, insert that into V1 if we
16694 // can do so cheaply.
16695 int NumElts = VT.getVectorNumElements();
16696 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16697
16698 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16699 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16700 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16701 return Insertion;
16702
16703 // Handle special cases where the lower or upper half is UNDEF.
16704 if (SDValue V =
16705 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16706 return V;
16707
16708 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16709 // can check for those subtargets here and avoid much of the subtarget
16710 // querying in the per-vector-type lowering routines. With AVX1 we have
16711 // essentially *zero* ability to manipulate a 256-bit vector with integer
16712 // types. Since we'll use floating point types there eventually, just
16713 // immediately cast everything to a float and operate entirely in that domain.
16714 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16715 int ElementBits = VT.getScalarSizeInBits();
16716 if (ElementBits < 32) {
16717 // No floating-point type is available; if we can't use the bit operations
16718 // for masking/blending, then decompose into 128-bit vectors.
16719 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16720 Subtarget, DAG))
16721 return V;
16722 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16723 return V;
16724 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16725 }
16726
16727 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16728 VT.getVectorNumElements());
16729 V1 = DAG.getBitcast(FpVT, V1);
16730 V2 = DAG.getBitcast(FpVT, V2);
16731 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16732 }
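  // [Editor's note: illustrative example, not part of the original source.]
  // For instance, a v8i32 shuffle on an AVX1-only target is bitcast to v8f32
  // here, lowered through the floating-point shuffle paths, and bitcast back
  // to v8i32 afterwards.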
16733
16734 switch (VT.SimpleTy) {
16735 case MVT::v4f64:
16736 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16737 case MVT::v4i64:
16738 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16739 case MVT::v8f32:
16740 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16741 case MVT::v8i32:
16742 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16743 case MVT::v16i16:
16744 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16745 case MVT::v32i8:
16746 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16747
16748 default:
16749 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16750 }
16751}
16752
16753/// Try to lower a vector shuffle as 128-bit shuffles.
16754static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16755 const APInt &Zeroable, SDValue V1, SDValue V2,
16756 const X86Subtarget &Subtarget,
16757 SelectionDAG &DAG) {
16758 assert(VT.getScalarSizeInBits() == 64 &&
16759        "Unexpected element type size for 128bit shuffle.");
16760
16761 // Handling a 256-bit vector requires VLX, and in that case the function
16762 // lowerV2X128VectorShuffle() is most probably a better solution.
16763 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16764
16765 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16766 SmallVector<int, 4> WidenedMask;
16767 if (!canWidenShuffleElements(Mask, WidenedMask))
16768 return SDValue();
16769
16770 // Try to use an insert into a zero vector.
16771 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16772 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16773 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16774 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16775 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16776 DAG.getIntPtrConstant(0, DL));
16777 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16778 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16779 DAG.getIntPtrConstant(0, DL));
16780 }
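  // [Editor's note: illustrative sketch; Zeroable is assumed to be a
  // per-element bitmask over the 8 original elements.] For example, a v8i64
  // mask of {0, 1, Z, Z, Z, Z, Z, Z} has (Zeroable & 0xf0) == 0xf0 and
  // (Zeroable & 0x0c) == 0x0c, so NumElts == 2 above and the low v2i64
  // subvector of V1 is inserted into an all-zero vector.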
16781
16782 // Check for patterns which can be matched with a single insert of a 256-bit
16783 // subvector.
16784 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
16785 {0, 1, 2, 3, 0, 1, 2, 3});
16786 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
16787 {0, 1, 2, 3, 8, 9, 10, 11})) {
16788 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16789 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
16790 OnlyUsesV1 ? V1 : V2,
16791 DAG.getIntPtrConstant(0, DL));
16792 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16793 DAG.getIntPtrConstant(4, DL));
16794 }
16795
16796 assert(WidenedMask.size() == 4);
16797
16798 // See if this is an insertion of the lower 128-bits of V2 into V1.
16799 bool IsInsert = true;
16800 int V2Index = -1;
16801 for (int i = 0; i < 4; ++i) {
16802 assert(WidenedMask[i] >= -1);
16803 if (WidenedMask[i] < 0)
16804 continue;
16805
16806 // Make sure all V1 subvectors are in place.
16807 if (WidenedMask[i] < 4) {
16808 if (WidenedMask[i] != i) {
16809 IsInsert = false;
16810 break;
16811 }
16812 } else {
16813 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16814 if (V2Index >= 0 || WidenedMask[i] != 4) {
16815 IsInsert = false;
16816 break;
16817 }
16818 V2Index = i;
16819 }
16820 }
16821 if (IsInsert && V2Index >= 0) {
16822 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16823 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16824 DAG.getIntPtrConstant(0, DL));
16825 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16826 }
16827
16828 // Try to lower to vshuf64x2/vshuf32x4.
16829 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16830 unsigned PermMask = 0;
16831 // Ensure elements came from the same Op.
16832 for (int i = 0; i < 4; ++i) {
16833 assert(WidenedMask[i] >= -1);
16834 if (WidenedMask[i] < 0)
16835 continue;
16836
16837 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
16838 unsigned OpIndex = i / 2;
16839 if (Ops[OpIndex].isUndef())
16840 Ops[OpIndex] = Op;
16841 else if (Ops[OpIndex] != Op)
16842 return SDValue();
16843
16844 // Convert the 128-bit shuffle mask selection values into 128-bit selection
16845 // bits defined by a vshuf64x2 instruction's immediate control byte.
16846 PermMask |= (WidenedMask[i] % 4) << (i * 2);
16847 }
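  // [Editor's note: worked example, not part of the original source.]
  // For a widened mask of {0, 1, 6, 7}, the loop above selects Ops[0] = V1
  // (widened lanes 0 and 1) and Ops[1] = V2 (widened lanes 2 and 3), and builds
  //   PermMask = 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0xE4,
  // i.e. the low two 128-bit lanes of V1 followed by the upper two of V2.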
16848
16849 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16850 DAG.getTargetConstant(PermMask, DL, MVT::i8));
16851}
16852
16853/// Handle lowering of 8-lane 64-bit floating point shuffles.
16854static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16855 const APInt &Zeroable, SDValue V1, SDValue V2,
16856 const X86Subtarget &Subtarget,
16857 SelectionDAG &DAG) {
16858 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16859 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16860 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16861
16862 if (V2.isUndef()) {
16863 // Use low duplicate instructions for masks that match their pattern.
16864 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
16865 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16866
16867 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16868 // Non-half-crossing single input shuffles can be lowered with an
16869 // interleaved permutation.
16870 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16871 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16872 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16873 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16874 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16875 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16876 }
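      // [Editor's note: worked example, not part of the original source.]
      // For a mask of {1, 0, 3, 2, 5, 4, 7, 6} (swap the elements of every
      // 64-bit pair), the comparisons above set bits 0, 2, 4 and 6, giving
      // VPERMILPMask = 0x55.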
16877
16878 SmallVector<int, 4> RepeatedMask;
16879 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16880 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16881 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16882 }
16883
16884 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16885 V2, Subtarget, DAG))
16886 return Shuf128;
16887
16888 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16889 return Unpck;
16890
16891 // Check if the blend happens to exactly fit that of SHUFPD.
16892 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16893 Zeroable, Subtarget, DAG))
16894 return Op;
16895
16896 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16897 DAG, Subtarget))
16898 return V;
16899
16900 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16901 Zeroable, Subtarget, DAG))
16902 return Blend;
16903
16904 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
16905}
16906
16907/// Handle lowering of 16-lane 32-bit floating point shuffles.
16908static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16909 const APInt &Zeroable, SDValue V1, SDValue V2,
16910 const X86Subtarget &Subtarget,
16911 SelectionDAG &DAG) {
16912 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16913 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16914 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16915
16916 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16917 // options to efficiently lower the shuffle.
16918 SmallVector<int, 4> RepeatedMask;
16919 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16920 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16921
16922 // Use even/odd duplicate instructions for masks that match their pattern.
16923 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
16924 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16925 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
16926 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16927
16928 if (V2.isUndef())
16929 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16930 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16931
16932 // Use dedicated unpack instructions for masks that match their pattern.
16933 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16934 return V;
16935
16936 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16937 Zeroable, Subtarget, DAG))
16938 return Blend;
16939
16940 // Otherwise, fall back to a SHUFPS sequence.
16941 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16942 }
16943
16944 // If we have a single input shuffle with different shuffle patterns in the
16945 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16946 if (V2.isUndef() &&
16947 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16948 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16949 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16950 }
16951
16952 // If we have AVX512F support, we can use VEXPAND.
16953 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16954 V1, V2, DAG, Subtarget))
16955 return V;
16956
16957 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
16958}
16959
16960/// Handle lowering of 8-lane 64-bit integer shuffles.
16961static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16962 const APInt &Zeroable, SDValue V1, SDValue V2,
16963 const X86Subtarget &Subtarget,
16964 SelectionDAG &DAG) {
16965 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16966 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16967 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16968
16969 if (V2.isUndef()) {
16970 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16971 // can use lower latency instructions that will operate on all four
16972 // 128-bit lanes.
16973 SmallVector<int, 2> Repeated128Mask;
16974 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16975 SmallVector<int, 4> PSHUFDMask;
16976 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
16977 return DAG.getBitcast(
16978 MVT::v8i64,
16979 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16980 DAG.getBitcast(MVT::v16i32, V1),
16981 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16982 }
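    // [Editor's note: illustrative example, not part of the original source.]
    // For instance, a repeated 128-bit mask of {1, 0} (swap the two i64
    // elements in every lane) scales to the PSHUFD mask {2, 3, 0, 1}, which
    // encodes as immediate 0x4E for the bitcast v16i32 value.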
16983
16984 SmallVector<int, 4> Repeated256Mask;
16985 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16986 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16987 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16988 }
16989
16990 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16991 V2, Subtarget, DAG))
16992 return Shuf128;
16993
16994 // Try to use shift instructions.
16995 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
16996 Zeroable, Subtarget, DAG))
16997 return Shift;
16998
16999 // Try to use VALIGN.
17000 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17001 Subtarget, DAG))
17002 return Rotate;
17003
17004 // Try to use PALIGNR.
17005 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17006 Subtarget, DAG))
17007 return Rotate;
17008
17009 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
17010 return Unpck;
17011 // If we have AVX512F support, we can use VEXPAND.
17012 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
17013 DAG, Subtarget))
17014 return V;
17015
17016 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17017 Zeroable, Subtarget, DAG))
17018 return Blend;
17019
17020 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
17021}
17022
17023/// Handle lowering of 16-lane 32-bit integer shuffles.
17024static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17025 const APInt &Zeroable, SDValue V1, SDValue V2,
17026 const X86Subtarget &Subtarget,
17027 SelectionDAG &DAG) {
17028 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17029 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17030 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17031
17032 // Whenever we can lower this as a zext, that instruction is strictly faster
17033 // than any alternative. It also allows us to fold memory operands into the
17034 // shuffle in many cases.
17035 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17036 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17037 return ZExt;
17038
17039 // If the shuffle mask is repeated in each 128-bit lane we can use more
17040 // efficient instructions that mirror the shuffles across the four 128-bit
17041 // lanes.
17042 SmallVector<int, 4> RepeatedMask;
17043 bool Is128BitLaneRepeatedShuffle =
17044 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17045 if (Is128BitLaneRepeatedShuffle) {
17046 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17047 if (V2.isUndef())
17048 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17049 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17050
17051 // Use dedicated unpack instructions for masks that match their pattern.
17052 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
17053 return V;
17054 }
17055
17056 // Try to use shift instructions.
17057 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
17058 Zeroable, Subtarget, DAG))
17059 return Shift;
17060
17061 // Try to use VALIGN.
17062 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17063 Subtarget, DAG))
17064 return Rotate;
17065
17066 // Try to use byte rotation instructions.
17067 if (Subtarget.hasBWI())
17068 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17069 Subtarget, DAG))
17070 return Rotate;
17071
17072 // Assume that a single SHUFPS is faster than using a permv shuffle.
17073 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17074 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17075 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17076 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17077 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17078 CastV1, CastV2, DAG);
17079 return DAG.getBitcast(MVT::v16i32, ShufPS);
17080 }
17081 // If we have AVX512F support, we can use VEXPAND.
17082 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17083 DAG, Subtarget))
17084 return V;
17085
17086 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17087 Zeroable, Subtarget, DAG))
17088 return Blend;
17089 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
17090}
17091
17092/// Handle lowering of 32-lane 16-bit integer shuffles.
17093static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17094 const APInt &Zeroable, SDValue V1, SDValue V2,
17095 const X86Subtarget &Subtarget,
17096 SelectionDAG &DAG) {
17097 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17098 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17099 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17100 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17101
17102 // Whenever we can lower this as a zext, that instruction is strictly faster
17103 // than any alternative. It also allows us to fold memory operands into the
17104 // shuffle in many cases.
17105 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17106 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17107 return ZExt;
17108
17109 // Use dedicated unpack instructions for masks that match their pattern.
17110 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17111 return V;
17112
17113 // Try to use shift instructions.
17114 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
17115 Zeroable, Subtarget, DAG))
17116 return Shift;
17117
17118 // Try to use byte rotation instructions.
17119 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17120 Subtarget, DAG))
17121 return Rotate;
17122
17123 if (V2.isUndef()) {
17124 // Try to use bit rotation instructions.
17125 if (SDValue Rotate =
17126 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17127 return Rotate;
17128
17129 SmallVector<int, 8> RepeatedMask;
17130 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17131 // As this is a single-input shuffle, the repeated mask should be
17132 // a strictly valid v8i16 mask that we can pass through to the v8i16
17133 // lowering to handle even the v32 case.
17134 return lowerV8I16GeneralSingleInputShuffle(
17135 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
17136 }
17137 }
17138
17139 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17140 Zeroable, Subtarget, DAG))
17141 return Blend;
17142
17143 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17144 Zeroable, Subtarget, DAG))
17145 return PSHUFB;
17146
17147 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
17148}
17149
17150/// Handle lowering of 64-lane 8-bit integer shuffles.
17151static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17152 const APInt &Zeroable, SDValue V1, SDValue V2,
17153 const X86Subtarget &Subtarget,
17154 SelectionDAG &DAG) {
17155 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17156 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17157 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17158 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17159
17160 // Whenever we can lower this as a zext, that instruction is strictly faster
17161 // than any alternative. It also allows us to fold memory operands into the
17162 // shuffle in many cases.
17163 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17164 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17165 return ZExt;
17166
17167 // Use dedicated unpack instructions for masks that match their pattern.
17168 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17169 return V;
17170
17171 // Use dedicated pack instructions for masks that match their pattern.
17172 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17173 Subtarget))
17174 return V;
17175
17176 // Try to use shift instructions.
17177 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
17178 Zeroable, Subtarget, DAG))
17179 return Shift;
17180
17181 // Try to use byte rotation instructions.
17182 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17183 Subtarget, DAG))
17184 return Rotate;
17185
17186 // Try to use bit rotation instructions.
17187 if (V2.isUndef())
17188 if (SDValue Rotate =
17189 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17190 return Rotate;
17191
17192 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17193 Zeroable, Subtarget, DAG))
17194 return PSHUFB;
17195
17196 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17197 if (Subtarget.hasVBMI())
17198 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
17199
17200 // Try to create an in-lane repeating shuffle mask and then shuffle the
17201 // results into the target lanes.
17202 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17203 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17204 return V;
17205
17206 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17207 Zeroable, Subtarget, DAG))
17208 return Blend;
17209
17210 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17211 // shuffle.
17212 if (!V2.isUndef())
17213 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17214 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17215 return Result;
17216
17217 // FIXME: Implement direct support for this type!
17218 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
17219}
17220
17221/// High-level routine to lower various 512-bit x86 vector shuffles.
17222///
17223/// This routine either breaks down the specific type of a 512-bit x86 vector
17224/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17225/// together based on the available instructions.
17226static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17227 MVT VT, SDValue V1, SDValue V2,
17228 const APInt &Zeroable,
17229 const X86Subtarget &Subtarget,
17230 SelectionDAG &DAG) {
17231 assert(Subtarget.hasAVX512() &&
17232        "Cannot lower 512-bit vectors w/ basic ISA!");
17233
17234 // If we have a single input to the zero element, insert that into V1 if we
17235 // can do so cheaply.
17236 int NumElts = Mask.size();
17237 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17238
17239 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17240 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17241 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17242 return Insertion;
17243
17244 // Handle special cases where the lower or upper half is UNDEF.
17245 if (SDValue V =
17246 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17247 return V;
17248
17249 // Check for being able to broadcast a single element.
17250 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17251 Subtarget, DAG))
17252 return Broadcast;
17253
17254 // Dispatch to each element type for lowering. If we don't have support for
17255 // specific element type shuffles at 512 bits, immediately split them and
17256 // lower them. Each lowering routine of a given type is allowed to assume that
17257 // the requisite ISA extensions for that element type are available.
17258 switch (VT.SimpleTy) {
17259 case MVT::v8f64:
17260 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17261 case MVT::v16f32:
17262 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17263 case MVT::v8i64:
17264 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17265 case MVT::v16i32:
17266 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17267 case MVT::v32i16:
17268 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17269 case MVT::v64i8:
17270 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17271
17272 default:
17273 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17274 }
17275}
17276
17277static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17278 MVT VT, SDValue V1, SDValue V2,
17279 const X86Subtarget &Subtarget,
17280 SelectionDAG &DAG) {
17281 // Shuffle should be unary.
17282 if (!V2.isUndef())
17283 return SDValue();
17284
17285 int ShiftAmt = -1;
17286 int NumElts = Mask.size();
17287 for (int i = 0; i != NumElts; ++i) {
17288 int M = Mask[i];
17289 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17290        "Unexpected mask index.");
17291 if (M < 0)
17292 continue;
17293
17294 // The first non-undef element determines our shift amount.
17295 if (ShiftAmt < 0) {
17296 ShiftAmt = M - i;
17297 // Need to be shifting right.
17298 if (ShiftAmt <= 0)
17299 return SDValue();
17300 }
17301 // All non-undef elements must shift by the same amount.
17302 if (ShiftAmt != M - i)
17303 return SDValue();
17304 }
17305 assert(ShiftAmt >= 0 && "All undef?");
17306
17307 // Great, we found a right shift.
17308 MVT WideVT = VT;
17309 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
17310 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17311 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
17312 DAG.getUNDEF(WideVT), V1,
17313 DAG.getIntPtrConstant(0, DL));
17314 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
17315 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17316 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17317 DAG.getIntPtrConstant(0, DL));
17318}
17319
17320// Determine if this shuffle can be implemented with a KSHIFT instruction.
17321// Returns the shift amount if possible or -1 if not. This is a simplified
17322// version of matchShuffleAsShift.
17323static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17324 int MaskOffset, const APInt &Zeroable) {
17325 int Size = Mask.size();
17326
17327 auto CheckZeros = [&](int Shift, bool Left) {
17328 for (int j = 0; j < Shift; ++j)
17329 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17330 return false;
17331
17332 return true;
17333 };
17334
17335 auto MatchShift = [&](int Shift, bool Left) {
17336 unsigned Pos = Left ? Shift : 0;
17337 unsigned Low = Left ? 0 : Shift;
17338 unsigned Len = Size - Shift;
17339 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17340 };
17341
17342 for (int Shift = 1; Shift != Size; ++Shift)
17343 for (bool Left : {true, false})
17344 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17345 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17346 return Shift;
17347 }
17348
17349 return -1;
17350}
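// [Editor's note: worked example, not part of the original source.]
// With Size == 8 and MaskOffset == 0, a mask of {2, 3, 4, 5, 6, 7, U, U}
// whose top two elements are zeroable matches the right-shift case at
// Shift == 2: CheckZeros passes for positions 6 and 7, Mask[0..5] is
// sequential starting at 2, and KSHIFTR with shift amount 2 is returned.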
17351
17352
17353// Lower vXi1 vector shuffles.
17354// There is no dedicated instruction on AVX-512 that shuffles the masks.
17355// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17356// vector, shuffle it, and then truncate it back.
17357static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17358 MVT VT, SDValue V1, SDValue V2,
17359 const APInt &Zeroable,
17360 const X86Subtarget &Subtarget,
17361 SelectionDAG &DAG) {
17362 assert(Subtarget.hasAVX512() &&
17363        "Cannot lower 512-bit vectors w/o basic ISA!");
17364
17365 int NumElts = Mask.size();
17366
17367 // Try to recognize shuffles that are just padding a subvector with zeros.
17368 int SubvecElts = 0;
17369 int Src = -1;
17370 for (int i = 0; i != NumElts; ++i) {
17371 if (Mask[i] >= 0) {
17372 // Grab the source from the first valid mask. All subsequent elements need
17373 // to use this same source.
17374 if (Src < 0)
17375 Src = Mask[i] / NumElts;
17376 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17377 break;
17378 }
17379
17380 ++SubvecElts;
17381 }
17382 assert(SubvecElts != NumElts && "Identity shuffle?");
17383
17384 // Clip to a power of 2.
17385 SubvecElts = PowerOf2Floor(SubvecElts);
17386
17387 // Make sure the number of zeroable bits in the top at least covers the bits
17388 // not covered by the subvector.
17389 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
17390 assert(Src >= 0 && "Expected a source!");
17391 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17392 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17393 Src == 0 ? V1 : V2,
17394 DAG.getIntPtrConstant(0, DL));
17395 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17396 DAG.getConstant(0, DL, VT),
17397 Extract, DAG.getIntPtrConstant(0, DL));
17398 }
17399
17400 // Try a simple shift right with undef elements. Later we'll try with zeros.
17401 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17402 DAG))
17403 return Shift;
17404
17405 // Try to match KSHIFTs.
17406 unsigned Offset = 0;
17407 for (SDValue V : { V1, V2 }) {
17408 unsigned Opcode;
17409 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17410 if (ShiftAmt >= 0) {
17411 MVT WideVT = VT;
17412 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
17413 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17414 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
17415 DAG.getUNDEF(WideVT), V,
17416 DAG.getIntPtrConstant(0, DL));
17417 // Widened right shifts need two shifts to ensure we shift in zeroes.
17418 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17419 int WideElts = WideVT.getVectorNumElements();
17420 // Shift left to put the original vector in the MSBs of the new size.
17421 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17422 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17423 // Increase the shift amount to account for the left shift.
17424 ShiftAmt += WideElts - NumElts;
17425 }
17426
17427 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17428 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17429 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17430 DAG.getIntPtrConstant(0, DL));
17431 }
17432 Offset += NumElts; // Increment for next iteration.
17433 }
17434
17435
17436
17437 MVT ExtVT;
17438 switch (VT.SimpleTy) {
17439 default:
17440    llvm_unreachable("Expected a vector of i1 elements");
17441 case MVT::v2i1:
17442 ExtVT = MVT::v2i64;
17443 break;
17444 case MVT::v4i1:
17445 ExtVT = MVT::v4i32;
17446 break;
17447 case MVT::v8i1:
17448    // Take a 512-bit type since more shuffles are available on KNL. If we have
17449    // VLX, use a 256-bit shuffle.
17450 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17451 break;
17452 case MVT::v16i1:
17453 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17454 // 256-bit operation available.
17455 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17456 break;
17457 case MVT::v32i1:
17458 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17459 // 256-bit operation available.
17460    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17461 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17462 break;
17463 case MVT::v64i1:
17464 // Fall back to scalarization. FIXME: We can do better if the shuffle
17465 // can be partitioned cleanly.
17466 if (!Subtarget.useBWIRegs())
17467 return SDValue();
17468 ExtVT = MVT::v64i8;
17469 break;
17470 }
17471
17472 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17473 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17474
17475 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17476  // Since i1 was sign-extended, we can rebuild the mask with a compare against zero.
17477 int NumElems = VT.getVectorNumElements();
17478 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17479 (Subtarget.hasDQI() && (NumElems < 32)))
17480 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17481 Shuffle, ISD::SETGT);
17482
17483 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17484}
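// Worked example for the KSHIFT path above (values chosen for illustration):
// a v4i1 shuffle that needs a KSHIFTR by 1 on a subtarget without DQI widens
// to WideVT = v16i1 (NumElts = 4, WideElts = 16). Because the input only
// occupies the low 4 bits of the wide register, a single KSHIFTR would shift
// in whatever the upper 12 bits hold, so the code first emits KSHIFTL by
// WideElts - NumElts = 12 to park the vector in the MSBs, then KSHIFTR by
// 1 + 12 = 13, which brings the wanted bits back down with guaranteed zeros
// shifted in, and finally extracts the low v4i1 subvector.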
17485
17486/// Helper function that returns true if the shuffle mask should be
17487/// commuted to improve canonicalization.
17488static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17489 int NumElements = Mask.size();
17490
17491 int NumV1Elements = 0, NumV2Elements = 0;
17492 for (int M : Mask)
17493 if (M < 0)
17494 continue;
17495 else if (M < NumElements)
17496 ++NumV1Elements;
17497 else
17498 ++NumV2Elements;
17499
17500 // Commute the shuffle as needed such that more elements come from V1 than
17501 // V2. This allows us to match the shuffle pattern strictly on how many
17502 // elements come from V1 without handling the symmetric cases.
17503 if (NumV2Elements > NumV1Elements)
17504 return true;
17505
17506  assert(NumV1Elements > 0 && "No V1 indices");
17507
17508 if (NumV2Elements == 0)
17509 return false;
17510
17511  // When the number of V1 and V2 elements is the same, try to minimize the
17512 // number of uses of V2 in the low half of the vector. When that is tied,
17513  // ensure that the sum of indices for V1 is equal to or lower than the sum of
17514 // indices for V2. When those are equal, try to ensure that the number of odd
17515 // indices for V1 is lower than the number of odd indices for V2.
17516 if (NumV1Elements == NumV2Elements) {
17517 int LowV1Elements = 0, LowV2Elements = 0;
17518 for (int M : Mask.slice(0, NumElements / 2))
17519 if (M >= NumElements)
17520 ++LowV2Elements;
17521 else if (M >= 0)
17522 ++LowV1Elements;
17523 if (LowV2Elements > LowV1Elements)
17524 return true;
17525 if (LowV2Elements == LowV1Elements) {
17526 int SumV1Indices = 0, SumV2Indices = 0;
17527 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17528 if (Mask[i] >= NumElements)
17529 SumV2Indices += i;
17530 else if (Mask[i] >= 0)
17531 SumV1Indices += i;
17532 if (SumV2Indices < SumV1Indices)
17533 return true;
17534 if (SumV2Indices == SumV1Indices) {
17535 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17536 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17537 if (Mask[i] >= NumElements)
17538 NumV2OddIndices += i % 2;
17539 else if (Mask[i] >= 0)
17540 NumV1OddIndices += i % 2;
17541 if (NumV2OddIndices < NumV1OddIndices)
17542 return true;
17543 }
17544 }
17545 }
17546
17547 return false;
17548}
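// Worked example (hypothetical mask): for the 4-element mask <4, 5, 0, 1>,
// NumV1Elements == NumV2Elements == 2, so the tie-breakers run. The low half
// <4, 5> uses two V2 lanes and no V1 lanes (LowV2Elements = 2, LowV1Elements
// = 0), so the routine returns true and the caller commutes the operands,
// turning the mask into <0, 1, 4, 5>.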
17549
17550/// Top-level lowering for x86 vector shuffles.
17551///
17552/// This handles decomposition, canonicalization, and lowering of all x86
17553/// vector shuffles. Most of the specific lowering strategies are encapsulated
17554/// above in helper routines. The canonicalization attempts to widen shuffles
17555/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17556/// s.t. only one of the two inputs needs to be tested, etc.
17557static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17558 SelectionDAG &DAG) {
17559 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17560 ArrayRef<int> OrigMask = SVOp->getMask();
17561 SDValue V1 = Op.getOperand(0);
17562 SDValue V2 = Op.getOperand(1);
17563 MVT VT = Op.getSimpleValueType();
17564 int NumElements = VT.getVectorNumElements();
17565 SDLoc DL(Op);
17566 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17567
17568  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17569         "Can't lower MMX shuffles");
17570
17571 bool V1IsUndef = V1.isUndef();
17572 bool V2IsUndef = V2.isUndef();
17573 if (V1IsUndef && V2IsUndef)
17574 return DAG.getUNDEF(VT);
17575
17576  // When we create a shuffle node we put the UNDEF node in the second operand,
17577 // but in some cases the first operand may be transformed to UNDEF.
17578 // In this case we should just commute the node.
17579 if (V1IsUndef)
17580 return DAG.getCommutedVectorShuffle(*SVOp);
17581
17582 // Check for non-undef masks pointing at an undef vector and make the masks
17583 // undef as well. This makes it easier to match the shuffle based solely on
17584 // the mask.
17585 if (V2IsUndef &&
17586 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17587 SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
17588 for (int &M : NewMask)
17589 if (M >= NumElements)
17590 M = -1;
17591 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17592 }
17593
17594 // Check for illegal shuffle mask element index values.
17595 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17596 (void)MaskUpperLimit;
17597  assert(llvm::all_of(OrigMask,
17598                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17599         "Out of bounds shuffle index");
17600
17601 // We actually see shuffles that are entirely re-arrangements of a set of
17602 // zero inputs. This mostly happens while decomposing complex shuffles into
17603 // simple ones. Directly lower these as a buildvector of zeros.
17604 APInt KnownUndef, KnownZero;
17605 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17606
17607 APInt Zeroable = KnownUndef | KnownZero;
17608 if (Zeroable.isAllOnesValue())
17609 return getZeroVector(VT, Subtarget, DAG, DL);
17610
17611 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17612
17613 // Try to collapse shuffles into using a vector type with fewer elements but
17614 // wider element types. We cap this to not form integers or floating point
17615 // elements wider than 64 bits, but it might be interesting to form i128
17616 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17617 SmallVector<int, 16> WidenedMask;
17618 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17619 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17620 // Shuffle mask widening should not interfere with a broadcast opportunity
17621 // by obfuscating the operands with bitcasts.
17622 // TODO: Avoid lowering directly from this top-level function: make this
17623 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17624 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17625 Subtarget, DAG))
17626 return Broadcast;
17627
17628 MVT NewEltVT = VT.isFloatingPoint()
17629 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17630 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17631 int NewNumElts = NumElements / 2;
17632 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17633 // Make sure that the new vector type is legal. For example, v2f64 isn't
17634 // legal on SSE1.
17635 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17636 if (V2IsZero) {
17637 // Modify the new Mask to take all zeros from the all-zero vector.
17638 // Choose indices that are blend-friendly.
17639 bool UsedZeroVector = false;
17640        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
17641               "V2's non-undef elements are used?!");
17642 for (int i = 0; i != NewNumElts; ++i)
17643 if (WidenedMask[i] == SM_SentinelZero) {
17644 WidenedMask[i] = i + NewNumElts;
17645 UsedZeroVector = true;
17646 }
17647 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17648 // some elements to be undef.
17649 if (UsedZeroVector)
17650 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17651 }
17652 V1 = DAG.getBitcast(NewVT, V1);
17653 V2 = DAG.getBitcast(NewVT, V2);
17654 return DAG.getBitcast(
17655 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17656 }
17657 }
17658
17659 // Commute the shuffle if it will improve canonicalization.
17660 SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
17661 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17662 ShuffleVectorSDNode::commuteMask(Mask);
17663 std::swap(V1, V2);
17664 }
17665
17666 if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
17667 return V;
17668
17669 // For each vector width, delegate to a specialized lowering routine.
17670 if (VT.is128BitVector())
17671 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17672
17673 if (VT.is256BitVector())
17674 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17675
17676 if (VT.is512BitVector())
17677 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17678
17679 if (Is1BitVector)
17680 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17681
17682  llvm_unreachable("Unimplemented!");
17683}
17684
17685/// Try to lower a VSELECT instruction to a vector shuffle.
17686static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17687 const X86Subtarget &Subtarget,
17688 SelectionDAG &DAG) {
17689 SDValue Cond = Op.getOperand(0);
17690 SDValue LHS = Op.getOperand(1);
17691 SDValue RHS = Op.getOperand(2);
17692 MVT VT = Op.getSimpleValueType();
17693
17694 // Only non-legal VSELECTs reach this lowering, convert those into generic
17695 // shuffles and re-use the shuffle lowering path for blends.
17696 SmallVector<int, 32> Mask;
17697 if (createShuffleMaskFromVSELECT(Mask, Cond))
17698 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17699
17700 return SDValue();
17701}
17702
17703SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17704 SDValue Cond = Op.getOperand(0);
17705 SDValue LHS = Op.getOperand(1);
17706 SDValue RHS = Op.getOperand(2);
17707
17708 // A vselect where all conditions and data are constants can be optimized into
17709 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17710 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17711 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17712 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17713 return SDValue();
17714
17715 // Try to lower this to a blend-style vector shuffle. This can handle all
17716 // constant condition cases.
17717 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17718 return BlendOp;
17719
17720  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17721 // with patterns on the mask registers on AVX-512.
17722 MVT CondVT = Cond.getSimpleValueType();
17723 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17724 if (CondEltSize == 1)
17725 return Op;
17726
17727 // Variable blends are only legal from SSE4.1 onward.
17728 if (!Subtarget.hasSSE41())
17729 return SDValue();
17730
17731 SDLoc dl(Op);
17732 MVT VT = Op.getSimpleValueType();
17733 unsigned EltSize = VT.getScalarSizeInBits();
17734 unsigned NumElts = VT.getVectorNumElements();
17735
17736 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17737 // into an i1 condition so that we can use the mask-based 512-bit blend
17738 // instructions.
17739 if (VT.getSizeInBits() == 512) {
17740 // Build a mask by testing the condition against zero.
17741 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17742 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17743 DAG.getConstant(0, dl, CondVT),
17744 ISD::SETNE);
17745 // Now return a new VSELECT using the mask.
17746 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17747 }
17748
17749 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17750 if (CondEltSize != EltSize) {
17751 // If we don't have a sign splat, rely on the expansion.
17752 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17753 return SDValue();
17754
17755 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17756 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17757 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17758 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17759 }
17760
17761 // Only some types will be legal on some subtargets. If we can emit a legal
17762  // VSELECT-matching blend, return Op; but if we need to expand, return
17763 // a null value.
17764 switch (VT.SimpleTy) {
17765 default:
17766 // Most of the vector types have blends past SSE4.1.
17767 return Op;
17768
17769 case MVT::v32i8:
17770 // The byte blends for AVX vectors were introduced only in AVX2.
17771 if (Subtarget.hasAVX2())
17772 return Op;
17773
17774 return SDValue();
17775
17776 case MVT::v8i16:
17777 case MVT::v16i16: {
17778 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17779 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17780 Cond = DAG.getBitcast(CastVT, Cond);
17781 LHS = DAG.getBitcast(CastVT, LHS);
17782 RHS = DAG.getBitcast(CastVT, RHS);
17783 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17784 return DAG.getBitcast(VT, Select);
17785 }
17786 }
17787}
17788
17789static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17790 MVT VT = Op.getSimpleValueType();
17791 SDLoc dl(Op);
17792
17793 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
17794 return SDValue();
17795
17796 if (VT.getSizeInBits() == 8) {
17797 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
17798 Op.getOperand(0), Op.getOperand(1));
17799 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17800 }
17801
17802 if (VT == MVT::f32) {
17803 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17804    // the result back to an FR32 register. It's only worth matching if the
17805 // result has a single use which is a store or a bitcast to i32. And in
17806 // the case of a store, it's not worth it if the index is a constant 0,
17807 // because a MOVSSmr can be used instead, which is smaller and faster.
17808 if (!Op.hasOneUse())
17809 return SDValue();
17810 SDNode *User = *Op.getNode()->use_begin();
17811 if ((User->getOpcode() != ISD::STORE ||
17812 isNullConstant(Op.getOperand(1))) &&
17813 (User->getOpcode() != ISD::BITCAST ||
17814 User->getValueType(0) != MVT::i32))
17815 return SDValue();
17816 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17817 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
17818 Op.getOperand(1));
17819 return DAG.getBitcast(MVT::f32, Extract);
17820 }
17821
17822 if (VT == MVT::i32 || VT == MVT::i64) {
17823 // ExtractPS/pextrq works with constant index.
17824 if (isa<ConstantSDNode>(Op.getOperand(1)))
17825 return Op;
17826 }
17827
17828 return SDValue();
17829}
17830
17831/// Extract one bit from mask vector, like v16i1 or v8i1.
17832/// AVX-512 feature.
17833static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17834 const X86Subtarget &Subtarget) {
17835 SDValue Vec = Op.getOperand(0);
17836 SDLoc dl(Vec);
17837 MVT VecVT = Vec.getSimpleValueType();
17838 SDValue Idx = Op.getOperand(1);
17839 MVT EltVT = Op.getSimpleValueType();
17840
17841  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17842         "Unexpected vector type in ExtractBitFromMaskVector");
17843
17844  // A variable index can't be handled in mask registers, so extend the
17845  // vector to VR512/VR128.
17846 if (!isa<ConstantSDNode>(Idx)) {
17847 unsigned NumElts = VecVT.getVectorNumElements();
17848    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17849    // than extending to 128/256-bit.
17850 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17851 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17852 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17853 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17854 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17855 }
17856
17857 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17858 if (IdxVal == 0) // the operation is legal
17859 return Op;
17860
17861 // Extend to natively supported kshift.
17862 unsigned NumElems = VecVT.getVectorNumElements();
17863 MVT WideVecVT = VecVT;
17864 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
17865 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
17866 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
17867 DAG.getUNDEF(WideVecVT), Vec,
17868 DAG.getIntPtrConstant(0, dl));
17869 }
17870
17871 // Use kshiftr instruction to move to the lower element.
17872 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
17873 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17874
17875 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17876 DAG.getIntPtrConstant(0, dl));
17877}
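// A scalar model (hypothetical, compiled separately from this file) of the
// constant-index path above: KSHIFTR by IdxVal followed by extracting element
// 0 of a mask register behaves like shifting an integer right and keeping its
// low bit.
#include <cstdint>
#include <cstdio>

static unsigned extractMaskBit(uint16_t KReg, unsigned IdxVal) {
  // KSHIFTR moves bit IdxVal into bit 0; EXTRACT_VECTOR_ELT of element 0 then
  // reads that single bit.
  return (KReg >> IdxVal) & 1u;
}

int main() {
  uint16_t Mask = 0x00A4; // v16i1 value 0b0000000010100100
  std::printf("%u %u %u\n", extractMaskBit(Mask, 2), extractMaskBit(Mask, 5),
              extractMaskBit(Mask, 0)); // prints "1 1 0"
  return 0;
}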
17878
17879SDValue
17880X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17881 SelectionDAG &DAG) const {
17882 SDLoc dl(Op);
17883 SDValue Vec = Op.getOperand(0);
17884 MVT VecVT = Vec.getSimpleValueType();
17885 SDValue Idx = Op.getOperand(1);
17886
17887 if (VecVT.getVectorElementType() == MVT::i1)
17888 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17889
17890 if (!isa<ConstantSDNode>(Idx)) {
17891    // It's more profitable to go through memory (1 cycle throughput)
17892    // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17893    // The IACA tool was used to get the performance estimate
17894    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer).
17895 //
17896 // example : extractelement <16 x i8> %a, i32 %i
17897 //
17898 // Block Throughput: 3.00 Cycles
17899 // Throughput Bottleneck: Port5
17900 //
17901 // | Num Of | Ports pressure in cycles | |
17902 // | Uops | 0 - DV | 5 | 6 | 7 | |
17903 // ---------------------------------------------
17904 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
17905 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
17906 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
17907 // Total Num Of Uops: 4
17908 //
17909 //
17910 // Block Throughput: 1.00 Cycles
17911 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
17912 //
17913 // | | Ports pressure in cycles | |
17914 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
17915 // ---------------------------------------------------------
17916 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
17917 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
17918 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
17919 // Total Num Of Uops: 4
17920
17921 return SDValue();
17922 }
17923
17924 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
17925
17926 // If this is a 256-bit vector result, first extract the 128-bit vector and
17927 // then extract the element from the 128-bit vector.
17928 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
17929 // Get the 128-bit vector.
17930 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
17931 MVT EltVT = VecVT.getVectorElementType();
17932
17933 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
17934    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
17935
17936 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
17937 // this can be done with a mask.
17938 IdxVal &= ElemsPerChunk - 1;
17939 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17940 DAG.getIntPtrConstant(IdxVal, dl));
17941 }
17942
17943  assert(VecVT.is128BitVector() && "Unexpected vector length");
17944
17945 MVT VT = Op.getSimpleValueType();
17946
17947 if (VT.getSizeInBits() == 16) {
17948 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
17949 // we're going to zero extend the register or fold the store (SSE41 only).
17950 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
17951 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
17952 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
17953 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17954 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17955
17956    // Transform it so it matches pextrw, which produces a 32-bit result.
17957 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
17958 Op.getOperand(0), Op.getOperand(1));
17959 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17960 }
17961
17962 if (Subtarget.hasSSE41())
17963 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
17964 return Res;
17965
17966  // TODO: We only extract a single element from v16i8; we can probably afford
17967  // to be more aggressive here before using the default approach of spilling to
17968  // the stack.
17969 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
17970 // Extract either the lowest i32 or any i16, and extract the sub-byte.
17971 int DWordIdx = IdxVal / 4;
17972 if (DWordIdx == 0) {
17973 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17974 DAG.getBitcast(MVT::v4i32, Vec),
17975 DAG.getIntPtrConstant(DWordIdx, dl));
17976 int ShiftVal = (IdxVal % 4) * 8;
17977 if (ShiftVal != 0)
17978 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
17979 DAG.getConstant(ShiftVal, dl, MVT::i8));
17980 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17981 }
17982
17983 int WordIdx = IdxVal / 2;
17984 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
17985 DAG.getBitcast(MVT::v8i16, Vec),
17986 DAG.getIntPtrConstant(WordIdx, dl));
17987 int ShiftVal = (IdxVal % 2) * 8;
17988 if (ShiftVal != 0)
17989 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
17990 DAG.getConstant(ShiftVal, dl, MVT::i8));
17991 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
17992 }
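  // Worked example for the sub-byte extraction above (illustrative indices):
  // extracting byte 5 of a v16i8 gives DWordIdx = 5 / 4 = 1, so the i32 fast
  // path is skipped; WordIdx = 5 / 2 = 2 selects the i16 word holding bytes
  // 4..5, ShiftVal = (5 % 2) * 8 = 8 moves byte 5 into the low byte, and the
  // final TRUNCATE keeps just that byte. For byte 2, DWordIdx = 0, so the i32
  // path is used with ShiftVal = (2 % 4) * 8 = 16.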
17993
17994 if (VT.getSizeInBits() == 32) {
17995 if (IdxVal == 0)
17996 return Op;
17997
17998 // SHUFPS the element to the lowest double word, then movss.
17999 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
18000 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18001 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18002 DAG.getIntPtrConstant(0, dl));
18003 }
18004
18005 if (VT.getSizeInBits() == 64) {
18006 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18007 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18008 // to match extract_elt for f64.
18009 if (IdxVal == 0)
18010 return Op;
18011
18012 // UNPCKHPD the element to the lowest double word, then movsd.
18013 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18014 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18015 int Mask[2] = { 1, -1 };
18016 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18018 DAG.getIntPtrConstant(0, dl));
18019 }
18020
18021 return SDValue();
18022}
18023
18024/// Insert one bit to mask vector, like v16i1 or v8i1.
18025/// AVX-512 feature.
18026static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18027 const X86Subtarget &Subtarget) {
18028 SDLoc dl(Op);
18029 SDValue Vec = Op.getOperand(0);
18030 SDValue Elt = Op.getOperand(1);
18031 SDValue Idx = Op.getOperand(2);
18032 MVT VecVT = Vec.getSimpleValueType();
18033
18034 if (!isa<ConstantSDNode>(Idx)) {
18035    // Non-constant index. Extend the source and destination,
18036    // insert the element, and then truncate the result.
18037 unsigned NumElts = VecVT.getVectorNumElements();
18038 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18039 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18040 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18041 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18042 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18043 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18044 }
18045
18046 // Copy into a k-register, extract to v1i1 and insert_subvector.
18047 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18048
18049 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
18050 Op.getOperand(2));
18051}
18052
18053SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18054 SelectionDAG &DAG) const {
18055 MVT VT = Op.getSimpleValueType();
18056 MVT EltVT = VT.getVectorElementType();
18057 unsigned NumElts = VT.getVectorNumElements();
18058
18059 if (EltVT == MVT::i1)
18060 return InsertBitToMaskVector(Op, DAG, Subtarget);
18061
18062 SDLoc dl(Op);
18063 SDValue N0 = Op.getOperand(0);
18064 SDValue N1 = Op.getOperand(1);
18065 SDValue N2 = Op.getOperand(2);
18066
18067 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18068 if (!N2C || N2C->getAPIntValue().uge(NumElts))
18069 return SDValue();
18070 uint64_t IdxVal = N2C->getZExtValue();
18071
18072 bool IsZeroElt = X86::isZeroNode(N1);
18073 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18074
18075  // If we are inserting an element, see if we can do this more efficiently with
18076  // a blend shuffle and a rematerializable vector rather than a costly integer
18077 // insertion.
18078 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
18079 16 <= EltVT.getSizeInBits()) {
18080 SmallVector<int, 8> BlendMask;
18081 for (unsigned i = 0; i != NumElts; ++i)
18082 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18083 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18084 : getOnesVector(VT, DAG, dl);
18085 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18086 }
18087
18088 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18089 // into that, and then insert the subvector back into the result.
18090 if (VT.is256BitVector() || VT.is512BitVector()) {
18091 // With a 256-bit vector, we can insert into the zero element efficiently
18092 // using a blend if we have AVX or AVX2 and the right data type.
18093 if (VT.is256BitVector() && IdxVal == 0) {
18094 // TODO: It is worthwhile to cast integer to floating point and back
18095 // and incur a domain crossing penalty if that's what we'll end up
18096 // doing anyway after extracting to a 128-bit vector.
18097 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18098 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
18099 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18100 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18101 DAG.getTargetConstant(1, dl, MVT::i8));
18102 }
18103 }
18104
18105 // Get the desired 128-bit vector chunk.
18106 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18107
18108 // Insert the element into the desired chunk.
18109 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
18110    assert(isPowerOf2_32(NumEltsIn128));
18111 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18112 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18113
18114 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18115 DAG.getIntPtrConstant(IdxIn128, dl));
18116
18117 // Insert the changed part back into the bigger vector
18118 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18119 }
18120  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18121
18122 // This will be just movd/movq/movss/movsd.
18123 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18124 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18125 EltVT == MVT::i64) {
18126 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18127 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18128 }
18129
18130 // We can't directly insert an i8 or i16 into a vector, so zero extend
18131 // it to i32 first.
18132 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18133 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18134 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
18135 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18136 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18137 return DAG.getBitcast(VT, N1);
18138 }
18139 }
18140
18141  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18142  // argument. SSE41 is required for pinsrb.
18143 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18144 unsigned Opc;
18145 if (VT == MVT::v8i16) {
18146      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18147 Opc = X86ISD::PINSRW;
18148 } else {
18149      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18150      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18151 Opc = X86ISD::PINSRB;
18152 }
18153
18154 if (N1.getValueType() != MVT::i32)
18155 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18156 if (N2.getValueType() != MVT::i32)
18157 N2 = DAG.getIntPtrConstant(IdxVal, dl);
18158 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18159 }
18160
18161 if (Subtarget.hasSSE41()) {
18162 if (EltVT == MVT::f32) {
18163 // Bits [7:6] of the constant are the source select. This will always be
18164 // zero here. The DAG Combiner may combine an extract_elt index into
18165 // these bits. For example (insert (extract, 3), 2) could be matched by
18166 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18167 // Bits [5:4] of the constant are the destination select. This is the
18168 // value of the incoming immediate.
18169 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18170 // combine either bitwise AND or insert of float 0.0 to set these bits.
18171
18172 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18173 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
18174 // If this is an insertion of 32-bits into the low 32-bits of
18175 // a vector, we prefer to generate a blend with immediate rather
18176 // than an insertps. Blends are simpler operations in hardware and so
18177 // will always have equal or better performance than insertps.
18178 // But if optimizing for size and there's a load folding opportunity,
18179 // generate insertps because blendps does not have a 32-bit memory
18180 // operand form.
18181 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18182 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18183 DAG.getTargetConstant(1, dl, MVT::i8));
18184 }
18185      // Create this as a scalar-to-vector.
18186 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18187 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18188 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18189 }
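    // Worked example for the INSERTPS immediate above (illustrative index):
    // for IdxVal = 2 the immediate is IdxVal << 4 = 0x20, i.e. bits [5:4] = 2
    // select destination lane 2, bits [7:6] = 0 take lane 0 of the
    // scalar-to-vector source, and bits [3:0] = 0 leave the zero mask empty.
    // For IdxVal = 0 the code instead prefers BLENDI with immediate 1 (blend
    // only lane 0) unless optimizing for size with a foldable load.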
18190
18191 // PINSR* works with constant index.
18192 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18193 return Op;
18194 }
18195
18196 return SDValue();
18197}
18198
18199static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18200 SelectionDAG &DAG) {
18201 SDLoc dl(Op);
18202 MVT OpVT = Op.getSimpleValueType();
18203
18204  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18205  // further combines.
18206 if (X86::isZeroNode(Op.getOperand(0)))
18207 return getZeroVector(OpVT, Subtarget, DAG, dl);
18208
18209 // If this is a 256-bit vector result, first insert into a 128-bit
18210 // vector and then insert into the 256-bit vector.
18211 if (!OpVT.is128BitVector()) {
18212 // Insert into a 128-bit vector.
18213 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18214 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18215 OpVT.getVectorNumElements() / SizeFactor);
18216
18217 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18218
18219 // Insert the 128-bit vector.
18220 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18221 }
18222  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18223         "Expected an SSE type!");
18224
18225 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
18226 if (OpVT == MVT::v4i32)
18227 return Op;
18228
18229 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18230 return DAG.getBitcast(
18231 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18232}
18233
18234// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18235// simple superregister reference or explicit instructions to insert
18236// the upper bits of a vector.
18237static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18238 SelectionDAG &DAG) {
18239  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18240
18241 return insert1BitVector(Op, DAG, Subtarget);
18242}
18243
18244static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18245 SelectionDAG &DAG) {
18246  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18247         "Only vXi1 extract_subvectors need custom lowering");
18248
18249 SDLoc dl(Op);
18250 SDValue Vec = Op.getOperand(0);
18251 SDValue Idx = Op.getOperand(1);
18252
18253 if (!isa<ConstantSDNode>(Idx))
18254 return SDValue();
18255
18256 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
18257 if (IdxVal == 0) // the operation is legal
18258 return Op;
18259
18260 MVT VecVT = Vec.getSimpleValueType();
18261 unsigned NumElems = VecVT.getVectorNumElements();
18262
18263 // Extend to natively supported kshift.
18264 MVT WideVecVT = VecVT;
18265 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
18266 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
18267 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
18268 DAG.getUNDEF(WideVecVT), Vec,
18269 DAG.getIntPtrConstant(0, dl));
18270 }
18271
18272 // Shift to the LSB.
18273 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
18274 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18275
18276 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18277 DAG.getIntPtrConstant(0, dl));
18278}
18279
18280// Returns the appropriate wrapper opcode for a global reference.
18281unsigned X86TargetLowering::getGlobalWrapperKind(
18282 const GlobalValue *GV, const unsigned char OpFlags) const {
18283 // References to absolute symbols are never PC-relative.
18284 if (GV && GV->isAbsoluteSymbolRef())
18285 return X86ISD::Wrapper;
18286
18287 CodeModel::Model M = getTargetMachine().getCodeModel();
18288 if (Subtarget.isPICStyleRIPRel() &&
18289 (M == CodeModel::Small || M == CodeModel::Kernel))
18290 return X86ISD::WrapperRIP;
18291
18292 // GOTPCREL references must always use RIP.
18293 if (OpFlags == X86II::MO_GOTPCREL)
18294 return X86ISD::WrapperRIP;
18295
18296 return X86ISD::Wrapper;
18297}
18298
18299// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18300// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18301// one of the above-mentioned nodes. It has to be wrapped because otherwise
18302// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18303// be used to form an addressing mode. These wrapped nodes will be selected
18304// into MOV32ri.
18305SDValue
18306X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18307 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18308
18309 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18310 // global base reg.
18311 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18312
18313 auto PtrVT = getPointerTy(DAG.getDataLayout());
18314 SDValue Result = DAG.getTargetConstantPool(
18315 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
18316 SDLoc DL(CP);
18317 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
18318 // With PIC, the address is actually $g + Offset.
18319 if (OpFlag) {
18320 Result =
18321 DAG.getNode(ISD::ADD, DL, PtrVT,
18322 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18323 }
18324
18325 return Result;
18326}
18327
18328SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18329 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18330
18331 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18332 // global base reg.
18333 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18334
18335 auto PtrVT = getPointerTy(DAG.getDataLayout());
18336 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18337 SDLoc DL(JT);
18338 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
18339
18340 // With PIC, the address is actually $g + Offset.
18341 if (OpFlag)
18342 Result =
18343 DAG.getNode(ISD::ADD, DL, PtrVT,
18344 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18345
18346 return Result;
18347}
18348
18349SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18350 SelectionDAG &DAG) const {
18351 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18352}
18353
18354SDValue
18355X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18356 // Create the TargetBlockAddressAddress node.
18357 unsigned char OpFlags =
18358 Subtarget.classifyBlockAddressReference();
18359 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18360 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18361 SDLoc dl(Op);
18362 auto PtrVT = getPointerTy(DAG.getDataLayout());
18363 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18364 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
18365
18366 // With PIC, the address is actually $g + Offset.
18367 if (isGlobalRelativeToPICBase(OpFlags)) {
18368 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18369 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18370 }
18371
18372 return Result;
18373}
18374
18375/// Creates target global address or external symbol nodes for calls or
18376/// other uses.
18377SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18378 bool ForCall) const {
18379 // Unpack the global address or external symbol.
18380 const SDLoc &dl = SDLoc(Op);
18381 const GlobalValue *GV = nullptr;
18382 int64_t Offset = 0;
18383 const char *ExternalSym = nullptr;
18384 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18385 GV = G->getGlobal();
18386 Offset = G->getOffset();
18387 } else {
18388 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18389 ExternalSym = ES->getSymbol();
18390 }
18391
18392 // Calculate some flags for address lowering.
18393 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18394 unsigned char OpFlags;
18395 if (ForCall)
18396 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18397 else
18398 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18399 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18400 bool NeedsLoad = isGlobalStubReference(OpFlags);
18401
18402 CodeModel::Model M = DAG.getTarget().getCodeModel();
18403 auto PtrVT = getPointerTy(DAG.getDataLayout());
18404 SDValue Result;
18405
18406 if (GV) {
18407 // Create a target global address if this is a global. If possible, fold the
18408 // offset into the global address reference. Otherwise, ADD it on later.
18409 int64_t GlobalOffset = 0;
18410 if (OpFlags == X86II::MO_NO_FLAG &&
18411 X86::isOffsetSuitableForCodeModel(Offset, M)) {
18412 std::swap(GlobalOffset, Offset);
18413 }
18414 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18415 } else {
18416 // If this is not a global address, this must be an external symbol.
18417 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18418 }
18419
18420 // If this is a direct call, avoid the wrapper if we don't need to do any
18421 // loads or adds. This allows SDAG ISel to match direct calls.
18422 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18423 return Result;
18424
18425 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18426
18427 // With PIC, the address is actually $g + Offset.
18428 if (HasPICReg) {
18429 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18430 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18431 }
18432
18433 // For globals that require a load from a stub to get the address, emit the
18434 // load.
18435 if (NeedsLoad)
18436 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18437 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18438
18439 // If there was a non-zero offset that we didn't fold, create an explicit
18440 // addition for it.
18441 if (Offset != 0)
18442 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18443 DAG.getConstant(Offset, dl, PtrVT));
18444
18445 return Result;
18446}
18447
18448SDValue
18449X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18450 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18451}
18452
18453static SDValue
18454GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18455 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
18456 unsigned char OperandFlags, bool LocalDynamic = false) {
18457 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18458 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18459 SDLoc dl(GA);
18460 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18461 GA->getValueType(0),
18462 GA->getOffset(),
18463 OperandFlags);
18464
18465 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
18466 : X86ISD::TLSADDR;
18467
18468 if (InFlag) {
18469 SDValue Ops[] = { Chain, TGA, *InFlag };
18470 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18471 } else {
18472 SDValue Ops[] = { Chain, TGA };
18473 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18474 }
18475
18476  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
18477 MFI.setAdjustsStack(true);
18478 MFI.setHasCalls(true);
18479
18480 SDValue Flag = Chain.getValue(1);
18481 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
18482}
18483
18484// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18485static SDValue
18486LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18487 const EVT PtrVT) {
18488 SDValue InFlag;
18489 SDLoc dl(GA); // ? function entry point might be better
18490 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18491 DAG.getNode(X86ISD::GlobalBaseReg,
18492 SDLoc(), PtrVT), InFlag);
18493 InFlag = Chain.getValue(1);
18494
18495 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
18496}
18497
18498// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
18499static SDValue
18500LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18501 const EVT PtrVT) {
18502 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18503 X86::RAX, X86II::MO_TLSGD);
18504}
18505
18506static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18507 SelectionDAG &DAG,
18508 const EVT PtrVT,
18509 bool is64Bit) {
18510 SDLoc dl(GA);
18511
18512 // Get the start address of the TLS block for this module.
18513 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
18514 .getInfo<X86MachineFunctionInfo>();
18515 MFI->incNumLocalDynamicTLSAccesses();
18516
18517 SDValue Base;
18518 if (is64Bit) {
18519 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
18520 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18521 } else {
18522 SDValue InFlag;
18523 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18524 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
18525 InFlag = Chain.getValue(1);
18526 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
18527 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18528 }
18529
18530 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18531 // of Base.
18532
18533 // Build x@dtpoff.
18534 unsigned char OperandFlags = X86II::MO_DTPOFF;
18535 unsigned WrapperKind = X86ISD::Wrapper;
18536 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18537 GA->getValueType(0),
18538 GA->getOffset(), OperandFlags);
18539 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18540
18541 // Add x@dtpoff with the base.
18542 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18543}
18544
18545// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18546static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18547 const EVT PtrVT, TLSModel::Model model,
18548 bool is64Bit, bool isPIC) {
18549 SDLoc dl(GA);
18550
18551 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18552 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
18553 is64Bit ? 257 : 256));
18554
18555 SDValue ThreadPointer =
18556 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18557 MachinePointerInfo(Ptr));
18558
18559 unsigned char OperandFlags = 0;
18560 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18561 // initialexec.
18562 unsigned WrapperKind = X86ISD::Wrapper;
18563 if (model == TLSModel::LocalExec) {
18564 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18565 } else if (model == TLSModel::InitialExec) {
18566 if (is64Bit) {
18567 OperandFlags = X86II::MO_GOTTPOFF;
18568 WrapperKind = X86ISD::WrapperRIP;
18569 } else {
18570 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18571 }
18572 } else {
18573    llvm_unreachable("Unexpected model");
18574 }
18575
18576 // emit "addl x@ntpoff,%eax" (local exec)
18577 // or "addl x@indntpoff,%eax" (initial exec)
18578 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18579 SDValue TGA =
18580 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18581 GA->getOffset(), OperandFlags);
18582 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18583
18584 if (model == TLSModel::InitialExec) {
18585 if (isPIC && !is64Bit) {
18586 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18587 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18588 Offset);
18589 }
18590
18591 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18592 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18593 }
18594
18595 // The address of the thread local variable is the add of the thread
18596 // pointer with the offset of the variable.
18597 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18598}
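// Note on the address spaces used above: in the x86 backend, address space
// 256 denotes a %gs-relative access and 257 an %fs-relative one, so the null
// pointer built here is simply %gs:0 (32-bit) or %fs:0 (64-bit), the
// conventional location of the thread pointer.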
18599
18600SDValue
18601X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18602
18603 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18604
18605 if (DAG.getTarget().useEmulatedTLS())
18606 return LowerToTLSEmulatedModel(GA, DAG);
18607
18608 const GlobalValue *GV = GA->getGlobal();
18609 auto PtrVT = getPointerTy(DAG.getDataLayout());
18610 bool PositionIndependent = isPositionIndependent();
18611
18612 if (Subtarget.isTargetELF()) {
18613 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18614 switch (model) {
18615 case TLSModel::GeneralDynamic:
18616 if (Subtarget.is64Bit())
18617 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18618 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18619 case TLSModel::LocalDynamic:
18620 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
18621 Subtarget.is64Bit());
18622 case TLSModel::InitialExec:
18623 case TLSModel::LocalExec:
18624 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18625 PositionIndependent);
18626 }
18627    llvm_unreachable("Unknown TLS model.");
18628 }
18629
18630 if (Subtarget.isTargetDarwin()) {
18631 // Darwin only has one model of TLS. Lower to that.
18632 unsigned char OpFlag = 0;
18633 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
18634 X86ISD::WrapperRIP : X86ISD::Wrapper;
18635
18636 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18637 // global base reg.
18638 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18639 if (PIC32)
18640 OpFlag = X86II::MO_TLVP_PIC_BASE;
18641 else
18642 OpFlag = X86II::MO_TLVP;
18643 SDLoc DL(Op);
18644 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18645 GA->getValueType(0),
18646 GA->getOffset(), OpFlag);
18647 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18648
18649 // With PIC32, the address is actually $g + Offset.
18650 if (PIC32)
18651 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18652 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18653 Offset);
18654
18655    // Lowering the machine ISD node will make sure everything ends up in the
18656    // right location.
18657 SDValue Chain = DAG.getEntryNode();
18658 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18659 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18660 SDValue Args[] = { Chain, Offset };
18661 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18662 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
18663 DAG.getIntPtrConstant(0, DL, true),
18664 Chain.getValue(1), DL);
18665
18666 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
18667 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18668 MFI.setAdjustsStack(true);
18669
18670 // And our return value (tls address) is in the standard call return value
18671 // location.
18672 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18673 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18674 }
18675
18676 if (Subtarget.isOSWindows()) {
18677 // Just use the implicit TLS architecture
18678 // Need to generate something similar to:
18679 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18680 // ; from TEB
18681    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
18682 // mov rcx, qword [rdx+rcx*8]
18683 // mov eax, .tls$:tlsvar
18684 // [rax+rcx] contains the address
18685 // Windows 64bit: gs:0x58
18686 // Windows 32bit: fs:__tls_array
18687
18688 SDLoc dl(GA);
18689 SDValue Chain = DAG.getEntryNode();
18690
18691 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18692 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18693 // use its literal value of 0x2C.
18694 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
18695 ? Type::getInt8PtrTy(*DAG.getContext(),
18696 256)
18697 : Type::getInt32PtrTy(*DAG.getContext(),
18698 257));
18699
18700 SDValue TlsArray = Subtarget.is64Bit()
18701 ? DAG.getIntPtrConstant(0x58, dl)
18702 : (Subtarget.isTargetWindowsGNU()
18703 ? DAG.getIntPtrConstant(0x2C, dl)
18704 : DAG.getExternalSymbol("_tls_array", PtrVT));
18705
18706 SDValue ThreadPointer =
18707 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18708
18709 SDValue res;
18710 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18711 res = ThreadPointer;
18712 } else {
18713 // Load the _tls_index variable
18714 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18715 if (Subtarget.is64Bit())
18716 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18717 MachinePointerInfo(), MVT::i32);
18718 else
18719 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18720
18721 auto &DL = DAG.getDataLayout();
18722 SDValue Scale =
18723 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18724 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18725
18726 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18727 }
18728
18729 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18730
18731 // Get the offset of start of .tls section
18732 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18733 GA->getValueType(0),
18734 GA->getOffset(), X86II::MO_SECREL);
18735 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18736
18737 // The address of the thread local variable is the add of the thread
18738 // pointer with the offset of the variable.
18739 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18740 }
18741
18742  llvm_unreachable("TLS not implemented for this target.");
18743}
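// For reference, the ELF cases above map onto the usual TLS code sequences
// (sketched here informally; the exact assembly depends on the relocation
// model and subtarget):
//   general/local dynamic: materialize x@TLSGD / x@TLSLD(M) and call
//                          __tls_get_addr via the TLSADDR pseudo
//   initial exec:          load the variable's TP offset from its GOT entry
//                          (x@gottpoff, x@indntpoff or x@gotntpoff) and add
//                          it to the thread pointer (%fs:0 or %gs:0)
//   local exec:            add the link-time constant x@tpoff / x@ntpoff to
//                          the thread pointer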
18744
18745/// Lower SRA_PARTS and friends, which return two i32 values
18746/// and take a 2 x i32 value to shift plus a shift amount.
18747/// TODO: Can this be moved to general expansion code?
18748static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18749  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
18750 MVT VT = Op.getSimpleValueType();
18751 unsigned VTBits = VT.getSizeInBits();
18752 SDLoc dl(Op);
18753 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
18754 SDValue ShOpLo = Op.getOperand(0);
18755 SDValue ShOpHi = Op.getOperand(1);
18756 SDValue ShAmt = Op.getOperand(2);
18757  // ISD::FSHL and ISD::FSHR have defined overflow behavior, but ISD::SHL and
18758  // ISD::SRA/L nodes don't. Insert an AND to be safe; it's optimized away
18759  // during isel.
18760 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
18761 DAG.getConstant(VTBits - 1, dl, MVT::i8));
18762 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
18763 DAG.getConstant(VTBits - 1, dl, MVT::i8))
18764 : DAG.getConstant(0, dl, VT);
18765
18766 SDValue Tmp2, Tmp3;
18767 if (Op.getOpcode() == ISD::SHL_PARTS) {
18768 Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
18769 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
18770 } else {
18771 Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
18772 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
18773 }
18774
18775  // If the shift amount is larger than or equal to the width of a part, we
18776  // can't rely on the results of shld/shrd. Insert a test and select the
18777  // appropriate values for large shift amounts.
18778 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
18779 DAG.getConstant(VTBits, dl, MVT::i8));
18780 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
18781 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
18782
18783 SDValue Hi, Lo;
18784 if (Op.getOpcode() == ISD::SHL_PARTS) {
18785 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
18786 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
18787 } else {
18788 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
18789 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
18790 }
18791
18792 return DAG.getMergeValues({ Lo, Hi }, dl);
18793}
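// For intuition, the selection logic above corresponds to this scalar sketch
// of SHL_PARTS on 32-bit parts (illustrative only; assumes 0 <= Amt < 64):
//
//   uint32_t NewLo, NewHi;
//   if (Amt & 32) {                // amount >= 32: the shld result is unusable
//     NewHi = Lo << (Amt & 31);
//     NewLo = 0;
//   } else {                       // amount < 32: shld produces the high part
//     NewHi = fshl(Hi, Lo, Amt);   // equals Hi when Amt == 0
//     NewLo = Lo << Amt;
//   }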
18794
18795static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
18796 SelectionDAG &DAG) {
18797 MVT VT = Op.getSimpleValueType();
18798  assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
18799         "Unexpected funnel shift opcode!");
18800
18801 SDLoc DL(Op);
18802 SDValue Op0 = Op.getOperand(0);
18803 SDValue Op1 = Op.getOperand(1);
18804 SDValue Amt = Op.getOperand(2);
18805
18806 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
18807
18808 if (VT.isVector()) {
18809    assert(Subtarget.hasVBMI2() && "Expected VBMI2");
18810
18811 if (IsFSHR)
18812 std::swap(Op0, Op1);
18813
18814 APInt APIntShiftAmt;
18815 if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
18816 uint64_t ShiftAmt = APIntShiftAmt.urem(VT.getScalarSizeInBits());
18817 return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0,
18818 Op1, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18819 }
18820
18821 return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
18822 Op0, Op1, Amt);
18823 }
18824
18825  assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
18826         "Unexpected funnel shift type!");
18827
18828 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
18829 bool OptForSize = DAG.shouldOptForSize();
18830 if (!OptForSize && Subtarget.isSHLDSlow())
18831 return SDValue();
18832
18833 if (IsFSHR)
18834 std::swap(Op0, Op1);
18835
18836  // i16 needs an explicit modulo of the shift amount; i32/i64 get it implicitly.
18837 if (VT == MVT::i16)
18838 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
18839 DAG.getConstant(15, DL, Amt.getValueType()));
18840
18841 unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
18842 return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
18843}
18844
18845// Try to use a packed vector operation to handle i64 on 32-bit targets when
18846// AVX512DQ is enabled.
18847static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
18848 const X86Subtarget &Subtarget) {
18849  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18850          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18851          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18852          Op.getOpcode() == ISD::UINT_TO_FP) &&
18853         "Unexpected opcode!");
18854 bool IsStrict = Op->isStrictFPOpcode();
18855 unsigned OpNo = IsStrict ? 1 : 0;
18856 SDValue Src = Op.getOperand(OpNo);
18857 MVT SrcVT = Src.getSimpleValueType();
18858 MVT VT = Op.getSimpleValueType();
18859
18860 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18861 (VT != MVT::f32 && VT != MVT::f64))
18862 return SDValue();
18863
18864 // Pack the i64 into a vector, do the operation and extract.
18865
18866  // Use a 256-bit (or wider) source vector so the f32 result is a full
       // 128-bit vector.
18867 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
18868 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
18869 MVT VecVT = MVT::getVectorVT(VT, NumElts);
18870
18871 SDLoc dl(Op);
18872 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
18873 if (IsStrict) {
18874 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
18875 {Op.getOperand(0), InVec});
18876 SDValue Chain = CvtVec.getValue(1);
18877 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18878 DAG.getIntPtrConstant(0, dl));
18879 return DAG.getMergeValues({Value, Chain}, dl);
18880 }
18881
18882 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
18883
18884 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
18885 DAG.getIntPtrConstant(0, dl));
18886}
18887
18888static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
18889 const X86Subtarget &Subtarget) {
18890 switch (Opcode) {
18891 case ISD::SINT_TO_FP:
18892 // TODO: Handle wider types with AVX/AVX512.
18893 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
18894 return false;
18895 // CVTDQ2PS or (V)CVTDQ2PD
18896 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
18897
18898 case ISD::UINT_TO_FP:
18899 // TODO: Handle wider types and i64 elements.
18900 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
18901 return false;
18902 // VCVTUDQ2PS or VCVTUDQ2PD
18903 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
18904
18905 default:
18906 return false;
18907 }
18908}
18909
18910/// Given a scalar cast operation that is extracted from a vector, try to
18911/// vectorize the cast op followed by extraction. This will avoid an expensive
18912/// round-trip between XMM and GPR.
18913static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
18914 const X86Subtarget &Subtarget) {
18915 // TODO: This could be enhanced to handle smaller integer types by peeking
18916 // through an extend.
18917 SDValue Extract = Cast.getOperand(0);
18918 MVT DestVT = Cast.getSimpleValueType();
18919 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
18920 !isa<ConstantSDNode>(Extract.getOperand(1)))
18921 return SDValue();
18922
18923 // See if we have a 128-bit vector cast op for this type of cast.
18924 SDValue VecOp = Extract.getOperand(0);
18925 MVT FromVT = VecOp.getSimpleValueType();
18926 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
18927 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
18928 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
18929 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
18930 return SDValue();
18931
18932 // If we are extracting from a non-zero element, first shuffle the source
18933 // vector to allow extracting from element zero.
18934 SDLoc DL(Cast);
18935 if (!isNullConstant(Extract.getOperand(1))) {
18936 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
18937 Mask[0] = Extract.getConstantOperandVal(1);
18938 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
18939 }
18940 // If the source vector is wider than 128-bits, extract the low part. Do not
18941 // create an unnecessarily wide vector cast op.
18942 if (FromVT != Vec128VT)
18943 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
18944
18945 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
18946 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
18947 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
18948 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
18949 DAG.getIntPtrConstant(0, DL));
18950}
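// Illustrative example of the transform above: a scalar sitofp of
// (extractelement <4 x i32> %v, i32 2) becomes a shuffle that moves element 2
// into lane 0, a v4i32 -> v4f32 vector cast (cvtdq2ps), and an extract of
// lane 0, avoiding the XMM <-> GPR round-trip of a scalar cvtsi2ss.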
18951
18952static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
18953 const X86Subtarget &Subtarget) {
18954 SDLoc DL(Op);
18955 bool IsStrict = Op->isStrictFPOpcode();
18956 MVT VT = Op->getSimpleValueType(0);
18957 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
18958
18959 if (Subtarget.hasDQI()) {
18960    assert(!Subtarget.hasVLX() && "Unexpected features");
18961
18962    assert((Src.getSimpleValueType() == MVT::v2i64 ||
18963            Src.getSimpleValueType() == MVT::v4i64) &&
18964           "Unsupported custom type");
18965
18966 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
18967    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
18968           "Unexpected VT!");
18969 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
18970
18971 // Need to concat with zero vector for strict fp to avoid spurious
18972 // exceptions.
18973 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
18974 : DAG.getUNDEF(MVT::v8i64);
18975 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
18976 DAG.getIntPtrConstant(0, DL));
18977 SDValue Res, Chain;
18978 if (IsStrict) {
18979 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
18980 {Op->getOperand(0), Src});
18981 Chain = Res.getValue(1);
18982 } else {
18983 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
18984 }
18985
18986 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18987 DAG.getIntPtrConstant(0, DL));
18988
18989 if (IsStrict)
18990 return DAG.getMergeValues({Res, Chain}, DL);
18991 return Res;
18992 }
18993
18994 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
18995 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
18996 if (VT != MVT::v4f32 || IsSigned)
18997 return SDValue();
18998
18999 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19000 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19001 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19002 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19003 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19004 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19005 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19006 SmallVector<SDValue, 4> SignCvts(4);
19007 SmallVector<SDValue, 4> Chains(4);
19008 for (int i = 0; i != 4; ++i) {
19009 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19010 DAG.getIntPtrConstant(i, DL));
19011 if (IsStrict) {
19012 SignCvts[i] =
19013 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19014 {Op.getOperand(0), Elt});
19015 Chains[i] = SignCvts[i].getValue(1);
19016 } else {
19017 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19018 }
19019 }
19020 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19021
19022 SDValue Slow, Chain;
19023 if (IsStrict) {
19024 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19025 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19026 {Chain, SignCvt, SignCvt});
19027 Chain = Slow.getValue(1);
19028 } else {
19029 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19030 }
19031
19032 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19033 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19034
19035 if (IsStrict)
19036 return DAG.getMergeValues({Cvt, Chain}, DL);
19037
19038 return Cvt;
19039}
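// The unsigned v4i64 fallback above uses the standard scalar expansion for
// uitofp: when the sign bit is set, convert ((Src >> 1) | (Src & 1)) as a
// signed value and then double the result with the FADD; OR-ing in the low
// bit keeps the final rounding correct for odd inputs.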
19040
19041SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19042 SelectionDAG &DAG) const {
19043 bool IsStrict = Op->isStrictFPOpcode();
19044 unsigned OpNo = IsStrict ? 1 : 0;
19045 SDValue Src = Op.getOperand(OpNo);
19046 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19047 MVT SrcVT = Src.getSimpleValueType();
19048 MVT VT = Op.getSimpleValueType();
19049 SDLoc dl(Op);
19050
19051 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19052 return Extract;
19053
19054 if (SrcVT.isVector()) {
19055 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19056      // Note: since v2f64 is a legal type, we don't need to zero-extend the
19057      // source for strict FP.
19058 if (IsStrict)
19059 return DAG.getNode(
19060 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19061 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19062 DAG.getUNDEF(SrcVT))});
19063 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19064 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19065 DAG.getUNDEF(SrcVT)));
19066 }
19067 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19068 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19069
19070 return SDValue();
19071 }
19072
19073  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19074         "Unknown SINT_TO_FP to lower!");
19075
19076 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19077
19078 // These are really Legal; return the operand so the caller accepts it as
19079 // Legal.
19080 if (SrcVT == MVT::i32 && UseSSEReg)
19081 return Op;
19082 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19083 return Op;
19084
19085 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19086 return V;
19087
19088 // SSE doesn't have an i16 conversion so we need to promote.
19089 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19090 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19091 if (IsStrict)
19092 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19093 {Chain, Ext});
19094
19095 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19096 }
19097
19098 if (VT == MVT::f128)
19099 return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
19100
19101 SDValue ValueToStore = Src;
19102 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19103 // Bitcasting to f64 here allows us to do a single 64-bit store from
19104 // an SSE register, avoiding the store forwarding penalty that would come
19105 // with two 32-bit stores.
19106 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19107
19108 unsigned Size = SrcVT.getStoreSize();
19109 MachineFunction &MF = DAG.getMachineFunction();
19110 auto PtrVT = getPointerTy(MF.getDataLayout());
19111 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
19112 MachinePointerInfo MPI =
19113 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19114 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19115 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Size);
19116 std::pair<SDValue, SDValue> Tmp =
19117 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Size, DAG);
19118
19119 if (IsStrict)
19120 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19121
19122 return Tmp.first;
19123}
19124
19125std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19126 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19127 MachinePointerInfo PtrInfo, unsigned Align, SelectionDAG &DAG) const {
19128 // Build the FILD
19129 SDVTList Tys;
19130 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19131 if (useSSE)
19132 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19133 else
19134 Tys = DAG.getVTList(DstVT, MVT::Other);
19135
19136 SDValue FILDOps[] = {Chain, Pointer};
19137 SDValue Result =
19138 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19139 Align, MachineMemOperand::MOLoad);
19140 Chain = Result.getValue(1);
19141
19142 if (useSSE) {
19143 MachineFunction &MF = DAG.getMachineFunction();
19144 unsigned SSFISize = DstVT.getStoreSize();
19145 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
19146 auto PtrVT = getPointerTy(MF.getDataLayout());
19147 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19148 Tys = DAG.getVTList(MVT::Other);
19149 SDValue FSTOps[] = {Chain, Result, StackSlot};
19150 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19151 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
19152 MachineMemOperand::MOStore, SSFISize, SSFISize);
19153
19154 Chain =
19155 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19156 Result = DAG.getLoad(
19157 DstVT, DL, Chain, StackSlot,
19158 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19159 Chain = Result.getValue(1);
19160 }
19161
19162 return { Result, Chain };
19163}
19164
19165/// Horizontal vector math instructions may be slower than normal math with
19166/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19167/// implementation, and likely shuffle complexity of the alternate sequence.
19168static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19169 const X86Subtarget &Subtarget) {
19170 bool IsOptimizingSize = DAG.shouldOptForSize();
19171 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19172 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19173}
19174
19175/// 64-bit unsigned integer to double expansion.
19176static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
19177 const X86Subtarget &Subtarget) {
19178  // This algorithm is not obvious. Here is what we're trying to output:
19179 /*
19180 movq %rax, %xmm0
19181 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19182 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19183 #ifdef __SSE3__
19184 haddpd %xmm0, %xmm0
19185 #else
19186 pshufd $0x4e, %xmm0, %xmm1
19187 addpd %xmm1, %xmm0
19188 #endif
19189 */
19190
19191 bool IsStrict = Op->isStrictFPOpcode();
19192 unsigned OpNo = IsStrict ? 1 : 0;
19193 SDLoc dl(Op);
19194 LLVMContext *Context = DAG.getContext();
19195
19196 // Build some magic constants.
19197 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19198 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19199 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19200 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
19201
19202 SmallVector<Constant*,2> CV1;
19203 CV1.push_back(
19204 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19205 APInt(64, 0x4330000000000000ULL))));
19206 CV1.push_back(
19207 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19208 APInt(64, 0x4530000000000000ULL))));
19209 Constant *C1 = ConstantVector::get(CV1);
19210 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
19211
19212 // Load the 64-bit value into an XMM register.
19213 SDValue XR1 =
19214 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
19215 SDValue CLod0 =
19216 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19217 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19218 /* Alignment = */ 16);
19219 SDValue Unpck1 =
19220 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19221
19222 SDValue CLod1 =
19223 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19224 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19225 /* Alignment = */ 16);
19226 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19227 SDValue Sub;
19228 SDValue Chain;
19229 // TODO: Are there any fast-math-flags to propagate here?
19230 if (IsStrict) {
19231 Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
19232 {Op.getOperand(0), XR2F, CLod1});
19233 Chain = Sub.getValue(1);
19234 } else
19235 Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19236 SDValue Result;
19237
19238 if (!IsStrict && Subtarget.hasSSE3() &&
19239 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19240 // FIXME: Do we need a STRICT version of FHADD?
19241 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19242 } else {
19243 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19244 if (IsStrict) {
19245 Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
19246 {Chain, Shuffle, Sub});
19247 Chain = Result.getValue(1);
19248 } else
19249 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19250 }
19251 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19252 DAG.getIntPtrConstant(0, dl));
19253 if (IsStrict)
19254 return DAG.getMergeValues({Result, Chain}, dl);
19255
19256 return Result;
19257}
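// Why the magic constants above work: interleaving the two 32-bit halves of
// the input with the exponent words 0x43300000 and 0x45300000 yields the
// doubles (2^52 + lo) and (2^84 + hi * 2^32), both exact. Subtracting the
// constants 2^52 and 2^84 (c1 above) leaves lo and hi * 2^32 exactly, and the
// final horizontal add recombines them into the converted value.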
19258
19259/// 32-bit unsigned integer to float expansion.
19260static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
19261 const X86Subtarget &Subtarget) {
19262 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19263 SDLoc dl(Op);
19264 // FP constant to bias correct the final result.
19265 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
19266 MVT::f64);
19267
19268 // Load the 32-bit value into an XMM register.
19269 SDValue Load =
19270 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19271
19272 // Zero out the upper parts of the register.
19273 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19274
19275 // Or the load with the bias.
19276 SDValue Or = DAG.getNode(
19277 ISD::OR, dl, MVT::v2i64,
19278 DAG.getBitcast(MVT::v2i64, Load),
19279 DAG.getBitcast(MVT::v2i64,
19280 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19281 Or =
19282 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19283 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19284
19285 if (Op.getNode()->isStrictFPOpcode()) {
19286 // Subtract the bias.
19287 // TODO: Are there any fast-math-flags to propagate here?
19288 SDValue Chain = Op.getOperand(0);
19289 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19290 {Chain, Or, Bias});
19291
19292 if (Op.getValueType() == Sub.getValueType())
19293 return Sub;
19294
19295 // Handle final rounding.
19296 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19297 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19298
19299 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19300 }
19301
19302 // Subtract the bias.
19303 // TODO: Are there any fast-math-flags to propagate here?
19304 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19305
19306 // Handle final rounding.
19307 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19308}
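// A minimal scalar sketch of the same bias trick (illustrative only, not part
// of the lowering itself; assumes IEEE-754 doubles and little-endian storage):
//
//   double U32ToDouble(uint32_t X) {
//     uint64_t Bits = 0x4330000000000000ULL | X; // the double 2^52 + X, exact
//     double D;
//     memcpy(&D, &Bits, sizeof(D));              // needs <cstring>
//     return D - 0x1.0p52;                       // leaves X, exactly
//   }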
19309
19310static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
19311 const X86Subtarget &Subtarget,
19312 const SDLoc &DL) {
19313 if (Op.getSimpleValueType() != MVT::v2f64)
19314 return SDValue();
19315
19316 bool IsStrict = Op->isStrictFPOpcode();
19317
19318 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19319  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19320
19321 if (Subtarget.hasAVX512()) {
19322 if (!Subtarget.hasVLX()) {
19323 // Let generic type legalization widen this.
19324 if (!IsStrict)
19325 return SDValue();
19326 // Otherwise pad the integer input with 0s and widen the operation.
19327 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19328 DAG.getConstant(0, DL, MVT::v2i32));
19329 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19330 {Op.getOperand(0), N0});
19331 SDValue Chain = Res.getValue(1);
19332 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19333 DAG.getIntPtrConstant(0, DL));
19334 return DAG.getMergeValues({Res, Chain}, DL);
19335 }
19336
19337 // Legalize to v4i32 type.
19338 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19339 DAG.getUNDEF(MVT::v2i32));
19340 if (IsStrict)
19341 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19342 {Op.getOperand(0), N0});
19343 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19344 }
19345
19346 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19347 // This gives us the floating point equivalent of 2^52 + the i32 integer
19348 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19349 // point leaving just our i32 integers in double format.
19350 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19351 SDValue VBias =
19352 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
19353 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19354 DAG.getBitcast(MVT::v2i64, VBias));
19355 Or = DAG.getBitcast(MVT::v2f64, Or);
19356
19357 if (IsStrict)
19358 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19359 {Op.getOperand(0), Or, VBias});
19360 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19361}
19362
19363static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
19364 const X86Subtarget &Subtarget) {
19365 SDLoc DL(Op);
19366 bool IsStrict = Op->isStrictFPOpcode();
19367 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19368 MVT VecIntVT = V.getSimpleValueType();
19369  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19370         "Unsupported custom type");
19371
19372 if (Subtarget.hasAVX512()) {
19373 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19374    assert(!Subtarget.hasVLX() && "Unexpected features");
19375 MVT VT = Op->getSimpleValueType(0);
19376
19377 // v8i32->v8f64 is legal with AVX512 so just return it.
19378 if (VT == MVT::v8f64)
19379 return Op;
19380
19381    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19382           "Unexpected VT!");
19383 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19384 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19385 // Need to concat with zero vector for strict fp to avoid spurious
19386 // exceptions.
19387 SDValue Tmp =
19388 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19389 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19390 DAG.getIntPtrConstant(0, DL));
19391 SDValue Res, Chain;
19392 if (IsStrict) {
19393 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19394 {Op->getOperand(0), V});
19395 Chain = Res.getValue(1);
19396 } else {
19397 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19398 }
19399
19400 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19401 DAG.getIntPtrConstant(0, DL));
19402
19403 if (IsStrict)
19404 return DAG.getMergeValues({Res, Chain}, DL);
19405 return Res;
19406 }
19407
19408 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19409 Op->getSimpleValueType(0) == MVT::v4f64) {
19410 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19411 Constant *Bias = ConstantFP::get(
19412 *DAG.getContext(),
19413 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19414 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19415 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
19416 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19417 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19418 SDValue VBias = DAG.getMemIntrinsicNode(
19419 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19420 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19421 /*Alignment*/ 8, MachineMemOperand::MOLoad);
19422
19423 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19424 DAG.getBitcast(MVT::v4i64, VBias));
19425 Or = DAG.getBitcast(MVT::v4f64, Or);
19426
19427 if (IsStrict)
19428 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19429 {Op.getOperand(0), Or, VBias});
19430 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19431 }
19432
19433 // The algorithm is the following:
19434 // #ifdef __SSE4_1__
19435 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19436 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19437 // (uint4) 0x53000000, 0xaa);
19438 // #else
19439 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19440 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19441 // #endif
19442 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19443 // return (float4) lo + fhi;
19444
19445 bool Is128 = VecIntVT == MVT::v4i32;
19446 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19447  // If we convert to something other than the supported type, e.g., to v4f64,
19448  // abort early.
19449 if (VecFloatVT != Op->getSimpleValueType(0))
19450 return SDValue();
19451
19452  // In the #ifdef/#else code, we have in common:
19453 // - The vector of constants:
19454 // -- 0x4b000000
19455 // -- 0x53000000
19456 // - A shift:
19457 // -- v >> 16
19458
19459 // Create the splat vector for 0x4b000000.
19460 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19461 // Create the splat vector for 0x53000000.
19462 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19463
19464 // Create the right shift.
19465 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19466 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19467
19468 SDValue Low, High;
19469 if (Subtarget.hasSSE41()) {
19470 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19471 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19472 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19473 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19474 // Low will be bitcasted right away, so do not bother bitcasting back to its
19475 // original type.
19476 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19477 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19478 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19479 // (uint4) 0x53000000, 0xaa);
19480 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19481 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19482 // High will be bitcasted right away, so do not bother bitcasting back to
19483 // its original type.
19484 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19485 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19486 } else {
19487 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19488 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19489 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19490 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19491
19492 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19493 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19494 }
19495
19496 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19497 SDValue VecCstFSub = DAG.getConstantFP(
19498 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19499
19500 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19501 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19502 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19503 // enabled. See PR24512.
19504 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19505 // TODO: Are there any fast-math-flags to propagate here?
19506 // (float4) lo;
19507 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19508 // return (float4) lo + fhi;
19509 if (IsStrict) {
19510 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19511 {Op.getOperand(0), HighBitcast, VecCstFSub});
19512 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19513 {FHigh.getValue(1), LowBitcast, FHigh});
19514 }
19515
19516 SDValue FHigh =
19517 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19518 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19519}
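// Why the lo/hi split above works: 0x4b000000 is the float 2^23, so blending
// a 16-bit value into its mantissa gives 2^23 + lo exactly, and 0x53000000 is
// 2^39, so the shifted high half becomes 2^39 + hi * 2^16. Subtracting the
// single constant (2^39 + 2^23) from the high part and adding the low part
// then reconstructs the original 32-bit value with one final rounding.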
19520
19521static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
19522 const X86Subtarget &Subtarget) {
19523 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19524 SDValue N0 = Op.getOperand(OpNo);
19525 MVT SrcVT = N0.getSimpleValueType();
19526 SDLoc dl(Op);
19527
19528 switch (SrcVT.SimpleTy) {
19529 default:
19530    llvm_unreachable("Custom UINT_TO_FP is not supported!");
19531 case MVT::v2i32:
19532 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
19533 case MVT::v4i32:
19534 case MVT::v8i32:
19535 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
19536 case MVT::v2i64:
19537 case MVT::v4i64:
19538 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
19539 }
19540}
19541
19542SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19543 SelectionDAG &DAG) const {
19544 bool IsStrict = Op->isStrictFPOpcode();
19545 unsigned OpNo = IsStrict ? 1 : 0;
19546 SDValue Src = Op.getOperand(OpNo);
19547 SDLoc dl(Op);
19548 auto PtrVT = getPointerTy(DAG.getDataLayout());
19549 MVT SrcVT = Src.getSimpleValueType();
19550 MVT DstVT = Op->getSimpleValueType(0);
19551 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19552
19553 if (DstVT == MVT::f128)
19554 return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
19555
19556 if (DstVT.isVector())
19557 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
19558
19559 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
19560 return Extract;
19561
19562 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19563 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19564 // Conversions from unsigned i32 to f32/f64 are legal,
19565 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19566 return Op;
19567 }
19568
19569 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19570 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19571 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19572 if (IsStrict)
19573 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19574 {Chain, Src});
19575 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19576 }
19577
19578 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
19579 return V;
19580
19581 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
19582 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
19583 if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
19584 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
19585 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
19586 return SDValue();
19587
19588 // Make a 64-bit buffer, and use it to build an FILD.
19589 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19590 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19591 MachinePointerInfo MPI =
19592 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19593 if (SrcVT == MVT::i32) {
19594 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
19595 SDValue Store1 =
19596 DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
19597 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19598 OffsetSlot, MPI.getWithOffset(4), 4);
19599 std::pair<SDValue, SDValue> Tmp =
19600 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, 8, DAG);
19601 if (IsStrict)
19602 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19603
19604 return Tmp.first;
19605 }
19606
19607  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19608 SDValue ValueToStore = Src;
19609 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19610 // Bitcasting to f64 here allows us to do a single 64-bit store from
19611 // an SSE register, avoiding the store forwarding penalty that would come
19612 // with two 32-bit stores.
19613 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19614 }
19615 SDValue Store =
19616 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, 8 /*Align*/);
19617 // For i64 source, we need to add the appropriate power of 2 if the input
19618 // was negative. This is the same as the optimization in
19619  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
19620 // we must be careful to do the computation in x87 extended precision, not
19621 // in SSE. (The generic code can't know it's OK to do this, or how to.)
19622 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19623 SDValue Ops[] = { Store, StackSlot };
19624 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
19625 MVT::i64, MPI, 8 /*Align*/,
19626 MachineMemOperand::MOLoad);
19627 Chain = Fild.getValue(1);
19628
19629
19630 // Check whether the sign bit is set.
19631 SDValue SignSet = DAG.getSetCC(
19632 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19633 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19634
19635 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19636 APInt FF(64, 0x5F80000000000000ULL);
19637 SDValue FudgePtr = DAG.getConstantPool(
19638 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19639 unsigned CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
19640
19641 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19642 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19643 SDValue Four = DAG.getIntPtrConstant(4, dl);
19644 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19645 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19646
19647 // Load the value out, extending it from f32 to f80.
19648 SDValue Fudge = DAG.getExtLoad(
19649 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19650 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19651 CPAlignment);
19652 Chain = Fudge.getValue(1);
19653 // Extend everything to 80 bits to force it to be done on x87.
19654 // TODO: Are there any fast-math-flags to propagate here?
19655 if (IsStrict) {
19656 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
19657 {Chain, Fild, Fudge});
19658 // STRICT_FP_ROUND can't handle equal types.
19659 if (DstVT == MVT::f80)
19660 return Add;
19661 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19662 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19663 }
19664 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
19665 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19666 DAG.getIntPtrConstant(0, dl));
19667}
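// Note on the fudge factor above: 0x5F800000 is the float 2^64. FILD treats
// the stored bits as a signed i64, so an input with the sign bit set is read
// back as (value - 2^64); conditionally adding 2^64 in x87 extended precision
// restores the unsigned value before the final rounding to the destination
// type.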
19668
19669// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19670// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19671// just return an SDValue().
19672// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19673// to i16, i32 or i64, and we lower it to a legal sequence and return the
19674// result.
19675SDValue
19676X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19677 bool IsSigned, SDValue &Chain) const {
19678 bool IsStrict = Op->isStrictFPOpcode();
19679 SDLoc DL(Op);
19680
19681 EVT DstTy = Op.getValueType();
19682 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19683 EVT TheVT = Value.getValueType();
19684 auto PtrVT = getPointerTy(DAG.getDataLayout());
19685
19686 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19687 // f16 must be promoted before using the lowering in this routine.
19688 // fp128 does not use this lowering.
19689 return SDValue();
19690 }
19691
19692 // If using FIST to compute an unsigned i64, we'll need some fixup
19693 // to handle values above the maximum signed i64. A FIST is always
19694 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19695 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19696
19697 // FIXME: This does not generate an invalid exception if the input does not
19698 // fit in i32. PR44019
19699 if (!IsSigned && DstTy != MVT::i64) {
19700 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19701 // The low 32 bits of the fist result will have the correct uint32 result.
19702    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19703 DstTy = MVT::i64;
19704 }
19705
19706 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19707 DstTy.getSimpleVT() >= MVT::i16 &&
19708 "Unknown FP_TO_INT to lower!");
19709
19710 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19711 // stack slot.
19712 MachineFunction &MF = DAG.getMachineFunction();
19713 unsigned MemSize = DstTy.getStoreSize();
19714 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
19715 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19716
19717 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19718
19719 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
19720
19721 if (UnsignedFixup) {
19722 //
19723 // Conversion to unsigned i64 is implemented with a select,
19724 // depending on whether the source value fits in the range
19725 // of a signed i64. Let Thresh be the FP equivalent of
19726 // 0x8000000000000000ULL.
19727 //
19728 // Adjust = (Value < Thresh) ? 0 : 0x80000000;
19729 // FltOfs = (Value < Thresh) ? 0 : 0x80000000;
19730 // FistSrc = (Value - FltOfs);
19731 // Fist-to-mem64 FistSrc
19732 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
19733 // to XOR'ing the high 32 bits with Adjust.
19734 //
19735 // Being a power of 2, Thresh is exactly representable in all FP formats.
19736 // For X87 we'd like to use the smallest FP type for this constant, but
19737 // for DAG type consistency we have to match the FP operand type.
19738
19739 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
19740 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
19741 bool LosesInfo = false;
19742 if (TheVT == MVT::f64)
19743 // The rounding mode is irrelevant as the conversion should be exact.
19744 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
19745 &LosesInfo);
19746 else if (TheVT == MVT::f80)
19747 Status = Thresh.convert(APFloat::x87DoubleExtended(),
19748 APFloat::rmNearestTiesToEven, &LosesInfo);
19749
19750 assert(Status == APFloat::opOK && !LosesInfo &&
19751 "FP conversion should have been exact");
19752
19753 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
19754
19755 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
19756 *DAG.getContext(), TheVT);
19757 SDValue Cmp;
19758 if (IsStrict) {
19759 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
19760 Chain, /*IsSignaling*/ true);
19761 Chain = Cmp.getValue(1);
19762 } else {
19763 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
19764 }
19765
19766 Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
19767 DAG.getConstant(0, DL, MVT::i64),
19768 DAG.getConstant(APInt::getSignMask(64),
19769 DL, MVT::i64));
19770 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
19771 DAG.getConstantFP(0.0, DL, TheVT),
19772 ThreshVal);
19773
19774 if (IsStrict) {
19775 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
19776 { Chain, Value, FltOfs });
19777 Chain = Value.getValue(1);
19778 } else
19779 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
19780 }
19781
19782 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
19783
19784 // FIXME This causes a redundant load/store if the SSE-class value is already
19785 // in memory, such as if it is on the callstack.
19786 if (isScalarFPTypeInSSEReg(TheVT)) {
19787 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
19788 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
19789 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19790 SDValue Ops[] = { Chain, StackSlot };
19791
19792 unsigned FLDSize = TheVT.getStoreSize();
19793 assert(FLDSize <= MemSize && "Stack slot not big enough");
19794 MachineMemOperand *MMO = MF.getMachineMemOperand(
19795 MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
19796 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
19797 Chain = Value.getValue(1);
19798 }
19799
19800 // Build the FP_TO_INT*_IN_MEM
19801 MachineMemOperand *MMO = MF.getMachineMemOperand(
19802 MPI, MachineMemOperand::MOStore, MemSize, MemSize);
19803 SDValue Ops[] = { Chain, Value, StackSlot };
19804 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
19805 DAG.getVTList(MVT::Other),
19806 Ops, DstTy, MMO);
19807
19808 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
19809 Chain = Res.getValue(1);
19810
19811 // If we need an unsigned fixup, XOR the result with adjust.
19812 if (UnsignedFixup)
19813 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
19814
19815 return Res;
19816}
19817
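FP_TO_INTHelper's unsigned fixup (the comment block around line 19722) subtracts the 2^63 threshold before the signed FIST when the input is at or above it, then XORs the sign bit back into the 64-bit result. A standalone scalar sketch of that idea, with an illustrative function name and assuming an input already in [0, 2^64):

#include <cstdint>
#include <cstdio>

// double -> u64 using only a signed conversion, mirroring the Adjust/FltOfs
// selects above. Well-defined here only for inputs in [0, 2^64).
static uint64_t double_to_u64_via_signed(double V) {
  const double Thresh = 0x1p63;                     // 2^63, exactly representable
  if (V < Thresh)                                   // fits in a signed i64
    return static_cast<uint64_t>(static_cast<int64_t>(V));
  uint64_t Low = static_cast<uint64_t>(static_cast<int64_t>(V - Thresh));
  return Low ^ (1ull << 63);                        // XOR the sign bit back in
}

int main() {
  const double Vals[] = {0.0, 123.5, 0x1p63, 0x1.8p63 /* 1.5 * 2^63 */};
  for (double V : Vals)
    std::printf("%.1f -> %llu\n", V,
                static_cast<unsigned long long>(double_to_u64_via_signed(V)));
  return 0;
}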
19818static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
19819 const X86Subtarget &Subtarget) {
19820 MVT VT = Op.getSimpleValueType();
19821 SDValue In = Op.getOperand(0);
19822 MVT InVT = In.getSimpleValueType();
19823 SDLoc dl(Op);
19824 unsigned Opc = Op.getOpcode();
19825
19826 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19827 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
19828 "Unexpected extension opcode");
19829 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19830 "Expected same number of elements");
19831 assert((VT.getVectorElementType() == MVT::i16 ||
19832 VT.getVectorElementType() == MVT::i32 ||
19833 VT.getVectorElementType() == MVT::i64) &&
19834 "Unexpected element type");
19835 assert((InVT.getVectorElementType() == MVT::i8 ||
19836 InVT.getVectorElementType() == MVT::i16 ||
19837 InVT.getVectorElementType() == MVT::i32) &&
19838 "Unexpected element type");
19839
19840 unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
19841
19842 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
19843 if (InVT == MVT::v8i8) {
19844 if (VT != MVT::v8i64)
19845 return SDValue();
19846
19847 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
19848 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
19849 return DAG.getNode(ExtendInVecOpc, dl, VT, In);
19850 }
19851
19852 if (Subtarget.hasInt256())
19853 return Op;
19854
19855 // Optimize vectors in AVX mode:
19856 //
19857 // v8i16 -> v8i32
19858 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
19859 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
19860 // Concat upper and lower parts.
19861 //
19862 // v4i32 -> v4i64
19863 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
19864 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
19865 // Concat upper and lower parts.
19866 //
19867 MVT HalfVT = VT.getHalfNumVectorElementsVT();
19868 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
19869
19870 // Short-circuit if we can determine that each 128-bit half is the same value.
19871 // Otherwise, this is difficult to match and optimize.
19872 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
19873 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
19874 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
19875
19876 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
19877 SDValue Undef = DAG.getUNDEF(InVT);
19878 bool NeedZero = Opc == ISD::ZERO_EXTEND;
19879 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
19880 OpHi = DAG.getBitcast(HalfVT, OpHi);
19881
19882 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19883}
19884
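The AVX1 path of LowerAVXExtend above zero-extends by handling the low half with an in-register PMOVZX-style extend and the high half with an unpack against a zero vector, then concatenating the two 128-bit results. A hedged intrinsics sketch of the v8i16 -> v8i32 zero-extend case it describes (helper name is mine; compile with AVX enabled, e.g. -mavx):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// v8i16 -> v8i32 zero-extend on AVX1: low four lanes via PMOVZXWD, high
// four via PUNPCKHWD with zero, then concatenate into a 256-bit vector.
static __m256i zext_v8i16_to_v8i32(__m128i In) {
  __m128i Lo = _mm_cvtepu16_epi32(In);                        // vpmovzxwd
  __m128i Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128());   // vpunpckhwd
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}

int main() {
  const uint16_t Src[8] = {1, 2, 3, 4, 0xFFFF, 6, 7, 8};
  uint32_t Dst[8];
  __m256i R = zext_v8i16_to_v8i32(_mm_loadu_si128((const __m128i *)Src));
  _mm256_storeu_si256((__m256i *)Dst, R);
  for (int I = 0; I < 8; ++I)
    std::printf("%u ", static_cast<unsigned>(Dst[I]));
  std::printf("\n");  // prints: 1 2 3 4 65535 6 7 8
  return 0;
}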
19885// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
19886static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
19887 const SDLoc &dl, SelectionDAG &DAG) {
19888 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
19889 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19890 DAG.getIntPtrConstant(0, dl));
19891 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
19892 DAG.getIntPtrConstant(8, dl));
19893 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
19894 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
19895 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
19896 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
19897}
19898
19899static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
19900 const X86Subtarget &Subtarget,
19901 SelectionDAG &DAG) {
19902 MVT VT = Op->getSimpleValueType(0);
19903 SDValue In = Op->getOperand(0);
19904 MVT InVT = In.getSimpleValueType();
19905 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19906 SDLoc DL(Op);
19907 unsigned NumElts = VT.getVectorNumElements();
19908
19909 // For all vectors but vXi8 we can just emit a sign_extend and a shift. This
19910 // avoids a constant pool load.
19911 if (VT.getVectorElementType() != MVT::i8) {
19912 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
19913 return DAG.getNode(ISD::SRL, DL, VT, Extend,
19914 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
19915 }
19916
19917 // Extend VT if BWI is not supported.
19918 MVT ExtVT = VT;
19919 if (!Subtarget.hasBWI()) {
19920 // If v16i32 is to be avoided, we'll need to split and concatenate.
19921 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19922 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
19923
19924 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19925 }
19926
19927 // Widen to 512-bits if VLX is not supported.
19928 MVT WideVT = ExtVT;
19929 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19930 NumElts *= 512 / ExtVT.getSizeInBits();
19931 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19932 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
19933 In, DAG.getIntPtrConstant(0, DL));
19934 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
19935 NumElts);
19936 }
19937
19938 SDValue One = DAG.getConstant(1, DL, WideVT);
19939 SDValue Zero = DAG.getConstant(0, DL, WideVT);
19940
19941 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
19942
19943 // Truncate if we had to extend above.
19944 if (VT != ExtVT) {
19945 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
19946 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
19947 }
19948
19949 // Extract back to 128/256-bit if we widened.
19950 if (WideVT != VT)
19951 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
19952 DAG.getIntPtrConstant(0, DL));
19953
19954 return SelectedVal;
19955}
19956
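For element types other than i8, LowerZERO_EXTEND_Mask above avoids a constant-pool load by sign-extending the i1 mask (producing all ones or all zeros per lane) and then logically shifting right by the element width minus one. The same identity in scalar form (illustrative helper, plain C++):

#include <cstdint>
#include <cstdio>

// zext(i1) == lshr(sext(i1), bits - 1): sign-extend gives 0 or ~0, the
// logical shift keeps only bit 0 of that pattern.
static uint32_t zext_i1_via_sext_srl(bool B) {
  int32_t Sext = B ? -1 : 0;                      // sign_extend i1 -> i32
  return static_cast<uint32_t>(Sext) >> 31;       // srl by 31 -> 0 or 1
}

int main() {
  std::printf("%u %u\n", zext_i1_via_sext_srl(false), zext_i1_via_sext_srl(true));
  return 0;  // prints: 0 1
}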
19957static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19958 SelectionDAG &DAG) {
19959 SDValue In = Op.getOperand(0);
19960 MVT SVT = In.getSimpleValueType();
19961
19962 if (SVT.getVectorElementType() == MVT::i1)
19963 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
19964
19965 assert(Subtarget.hasAVX() && "Expected AVX support");
19966 return LowerAVXExtend(Op, DAG, Subtarget);
19967}
19968
19969/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
19970/// It makes use of the fact that vectors with enough leading sign/zero bits
19971/// prevent the PACKSS/PACKUS from saturating the results.
19972/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
19973/// within each 128-bit lane.
19974static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
19975 const SDLoc &DL, SelectionDAG &DAG,
19976 const X86Subtarget &Subtarget) {
19977 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
19978 "Unexpected PACK opcode");
19979 assert(DstVT.isVector() && "VT not a vector?");
19980
19981 // Requires SSE2 but AVX512 has fast vector truncate.
19982 if (!Subtarget.hasSSE2())
19983 return SDValue();
19984
19985 EVT SrcVT = In.getValueType();
19986
19987 // No truncation required, we might get here due to recursive calls.
19988 if (SrcVT == DstVT)
19989 return In;
19990
19991 // We only support vector truncation to 64bits or greater from a
19992 // 128bits or greater source.
19993 unsigned DstSizeInBits = DstVT.getSizeInBits();
19994 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
19995 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
19996 return SDValue();
19997
19998 unsigned NumElems = SrcVT.getVectorNumElements();
19999 if (!isPowerOf2_32(NumElems))
20000 return SDValue();
20001
20002 LLVMContext &Ctx = *DAG.getContext();
20003 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20004 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20005
20006 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20007
20008 // Pack to the largest type possible:
20009 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20010 EVT InVT = MVT::i16, OutVT = MVT::i8;
20011 if (SrcVT.getScalarSizeInBits() > 16 &&
20012 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20013 InVT = MVT::i32;
20014 OutVT = MVT::i16;
20015 }
20016
20017 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
20018 if (SrcVT.is128BitVector()) {
20019 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20020 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20021 In = DAG.getBitcast(InVT, In);
20022 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
20023 Res = extractSubVector(Res, 0, DAG, DL, 64);
20024 return DAG.getBitcast(DstVT, Res);
20025 }
20026
20027 // Extract lower/upper subvectors.
20028 unsigned NumSubElts = NumElems / 2;
20029 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
20030 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
20031
20032 unsigned SubSizeInBits = SrcSizeInBits / 2;
20033 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20034 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20035
20036 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20037 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20038 Lo = DAG.getBitcast(InVT, Lo);
20039 Hi = DAG.getBitcast(InVT, Hi);
20040 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20041 return DAG.getBitcast(DstVT, Res);
20042 }
20043
20044 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20045 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20046 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20047 Lo = DAG.getBitcast(InVT, Lo);
20048 Hi = DAG.getBitcast(InVT, Hi);
20049 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20050
20051 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20052 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20053 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20054 SmallVector<int, 64> Mask;
20055 int Scale = 64 / OutVT.getScalarSizeInBits();
20056 scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
20057 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20058
20059 if (DstVT.is256BitVector())
20060 return DAG.getBitcast(DstVT, Res);
20061
20062 // If 512bit -> 128bit truncate another stage.
20063 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20064 Res = DAG.getBitcast(PackedVT, Res);
20065 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20066 }
20067
20068 // Recursively pack lower/upper subvectors, concat result and pack again.
20069 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20070 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
20071 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
20072 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
20073
20074 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20075 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20076 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20077}
20078
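truncateVectorWithPACK above is only a genuine truncation when the source lanes carry enough leading sign or zero bits that PACKSS/PACKUS never saturate; that is the precondition LowerTRUNCATE checks via ComputeNumSignBits/computeKnownBits before calling it. A small SSE2 sketch of the v8i32 -> v8i16 case under that assumption (helper name is mine; compile for SSE2 or later):

#include <emmintrin.h>   // SSE2
#include <cstdint>
#include <cstdio>

// v8i32 -> v8i16 with PACKSSDW. This is a plain truncation only because
// every 32-bit lane below already holds a sign-extended 16-bit value;
// otherwise packssdw would clamp to [-32768, 32767].
static __m128i trunc_v8i32_to_v8i16(__m128i Lo, __m128i Hi) {
  return _mm_packs_epi32(Lo, Hi);  // packssdw: 4 + 4 dwords -> 8 words
}

int main() {
  const int32_t Src[8] = {1, -2, 300, -400, 32767, -32768, 7, -8};
  int16_t Dst[8];
  __m128i Lo = _mm_loadu_si128((const __m128i *)&Src[0]);
  __m128i Hi = _mm_loadu_si128((const __m128i *)&Src[4]);
  _mm_storeu_si128((__m128i *)Dst, trunc_v8i32_to_v8i16(Lo, Hi));
  for (int I = 0; I < 8; ++I)
    std::printf("%d ", Dst[I]);
  std::printf("\n");  // prints the same eight values, now as i16
  return 0;
}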
20079static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20080 const X86Subtarget &Subtarget) {
20081
20082 SDLoc DL(Op);
20083 MVT VT = Op.getSimpleValueType();
20084 SDValue In = Op.getOperand(0);
20085 MVT InVT = In.getSimpleValueType();
20086
20087 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20088
20089 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20090 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20091 if (InVT.getScalarSizeInBits() <= 16) {
20092 if (Subtarget.hasBWI()) {
20093 // legal, will go to VPMOVB2M, VPMOVW2M
20094 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20095 // We need to shift to get the lsb into sign position.
20096 // Shift packed bytes not supported natively, bitcast to word
20097 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20098 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20099 DAG.getBitcast(ExtVT, In),
20100 DAG.getConstant(ShiftInx, DL, ExtVT));
20101 In = DAG.getBitcast(InVT, In);
20102 }
20103 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20104 In, ISD::SETGT);
20105 }
20106 // Use TESTD/Q, extended vector to packed dword/qword.
20107 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20108 "Unexpected vector type.");
20109 unsigned NumElts = InVT.getVectorNumElements();
20110 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20111 // We need to change to a wider element type that we have support for.
20112 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20113 // For 16 element vectors we extend to v16i32 unless we are explicitly
20114 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20115 // we need to split into two 8 element vectors which we can extend to v8i32,
20116 // truncate and concat the results. There's an additional complication if
20117 // the original type is v16i8. In that case we can't split the v16i8 so
20118 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
20119 // to v8i32, truncate that to v8i1 and concat the two halves.
20120 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20121 if (InVT == MVT::v16i8) {
20122 // First we need to sign extend up to 256-bits so we can split that.
20123 InVT = MVT::v16i16;
20124 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
20125 }
20126 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
20127 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
20128 // We're split now, just emit two truncates and a concat. The two
20129 // truncates will trigger legalization to come back to this function.
20130 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20131 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20132 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20133 }
20134 // We either have 8 elements or we're allowed to use 512-bit vectors.
20135 // If we have VLX, we want to use the narrowest vector that can get the
20136 // job done so we use vXi32.
20137 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20138 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20139 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20140 InVT = ExtVT;
20141 ShiftInx = InVT.getScalarSizeInBits() - 1;
20142 }
20143
20144 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20145 // We need to shift to get the lsb into sign position.
20146 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20147 DAG.getConstant(ShiftInx, DL, InVT));
20148 }
20149 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20150 if (Subtarget.hasDQI())
20151 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20152 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20153}
20154
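LowerTruncateVecI1 above keeps only the low bit of each lane by shifting it into the sign position and then doing a signed compare against zero (0 > x), which is the form VPMOV*2M / VPTESTM consume. The same per-lane trick in scalar form (illustrative helper, plain C++):

#include <cstdint>
#include <cstdio>

// Truncate an i16 lane to i1: move the LSB into the sign bit, then test
// whether the signed result is negative -- i.e. "0 > x" as in the SETGT above.
static bool trunc_to_i1(uint16_t Lane) {
  uint16_t Shifted = static_cast<uint16_t>(Lane << 15); // LSB -> sign position
  return static_cast<int16_t>(Shifted) < 0;             // sign set <=> LSB was 1
}

int main() {
  const uint16_t Vals[] = {0x0000u, 0x0001u, 0x0002u, 0xFFFDu};
  for (uint16_t V : Vals)
    std::printf("trunc_to_i1(0x%04x) = %d\n", V, trunc_to_i1(V));
  return 0;  // prints 0, 1, 0, 1
}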
20155SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20156 SDLoc DL(Op);
20157 MVT VT = Op.getSimpleValueType();
20158 SDValue In = Op.getOperand(0);
20159 MVT InVT = In.getSimpleValueType();
20160 unsigned InNumEltBits = InVT.getScalarSizeInBits();
20161
20162 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20163 "Invalid TRUNCATE operation");
20164
20165 // If we're called by the type legalizer, handle a few cases.
20166 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20167 if (!TLI.isTypeLegal(InVT)) {
20168 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20169 VT.is128BitVector()) {
20170 assert(Subtarget.hasVLX() && "Unexpected subtarget!");
20171 // The default behavior is to truncate one step, concatenate, and then
20172 // truncate the remainder. We'd rather produce two 64-bit results and
20173 // concatenate those.
20174 SDValue Lo, Hi;
20175 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20176
20177 EVT LoVT, HiVT;
20178 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20179
20180 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20181 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20182 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20183 }
20184
20185 // Otherwise let default legalization handle it.
20186 return SDValue();
20187 }
20188
20189 if (VT.getVectorElementType() == MVT::i1)
20190 return LowerTruncateVecI1(Op, DAG, Subtarget);
20191
20192 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20193 if (Subtarget.hasAVX512()) {
20194 // word to byte only under BWI. Otherwise we have to promote to v16i32
20195 // and then truncate that. But we should only do that if we haven't been
20196 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20197 // handled by isel patterns.
20198 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20199 Subtarget.canExtendTo512DQ())
20200 return Op;
20201 }
20202
20203 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
20204 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20205
20206 // Truncate with PACKUS if we are truncating a vector with leading zero bits
20207 // that extend all the way to the packed/truncated value.
20208 // Pre-SSE41 we can only use PACKUSWB.
20209 KnownBits Known = DAG.computeKnownBits(In);
20210 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
20211 if (SDValue V =
20212 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
20213 return V;
20214
20215 // Truncate with PACKSS if we are truncating a vector with sign-bits that
20216 // extend all the way to the packed/truncated value.
20217 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
20218 if (SDValue V =
20219 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
20220 return V;
20221
20222 // Handle truncation of V256 to V128 using shuffles.
20223 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20224
20225 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20226 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20227 if (Subtarget.hasInt256()) {
20228 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20229 In = DAG.getBitcast(MVT::v8i32, In);
20230 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20231 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20232 DAG.getIntPtrConstant(0, DL));
20233 }
20234
20235 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20236 DAG.getIntPtrConstant(0, DL));
20237 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20238 DAG.getIntPtrConstant(2, DL));
20239 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
20240 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
20241 static const int ShufMask[] = {0, 2, 4, 6};
20242 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
20243 }
20244
20245 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20246 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20247 if (Subtarget.hasInt256()) {
20248 In = DAG.getBitcast(MVT::v32i8, In);
20249
20250 // The PSHUFB mask:
20251 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20252 -1, -1, -1, -1, -1, -1, -1, -1,
20253 16, 17, 20, 21, 24, 25, 28, 29,
20254 -1, -1, -1, -1, -1, -1, -1, -1 };
20255 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20256 In = DAG.getBitcast(MVT::v4i64, In);
20257
20258 static const int ShufMask2[] = {0, 2, -1, -1};
20259 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20260 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20261 DAG.getIntPtrConstant(0, DL));
20262 return DAG.getBitcast(VT, In);
20263 }
20264
20265 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
20266 DAG.getIntPtrConstant(0, DL));
20267
20268 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
20269 DAG.getIntPtrConstant(4, DL));
20270
20271 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
20272 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
20273
20274 // The PSHUFB mask:
20275 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
20276 -1, -1, -1, -1, -1, -1, -1, -1};
20277
20278 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
20279 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
20280
20281 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
20282 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
20283
20284 // The MOVLHPS Mask:
20285 static const int ShufMask2[] = {0, 1, 4, 5};
20286 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
20287 return DAG.getBitcast(MVT::v8i16, res);
20288 }
20289
20290 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
20291 // Use an AND to zero upper bits for PACKUS.
20292 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
20293
20294 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
20295 DAG.getIntPtrConstant(0, DL));
20296 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
20297 DAG.getIntPtrConstant(8, DL));
20298 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
20299 }
20300
20301 llvm_unreachable("All 256->128 cases should have been handled above!");
20302}
20303
20304SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20305 bool IsStrict = Op->isStrictFPOpcode();
20306 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20307 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20308 MVT VT = Op->getSimpleValueType(0);
20309 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20310 MVT SrcVT = Src.getSimpleValueType();
20311 SDLoc dl(Op);
20312
20313 if (VT.isVector()) {
20314 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20315 MVT ResVT = MVT::v4i32;
20316 MVT TruncVT = MVT::v4i1;
20317 unsigned Opc;
20318 if (IsStrict)
20319 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20320 else
20321 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20322
20323 if (!IsSigned && !Subtarget.hasVLX()) {
20324 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20325 // Widen to 512-bits.
20326 ResVT = MVT::v8i32;
20327 TruncVT = MVT::v8i1;
20328 Opc = Op.getOpcode();
20329 // Need to concat with zero vector for strict fp to avoid spurious
20330 // exceptions.
20331 // TODO: Should we just do this for non-strict as well?
20332 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20333 : DAG.getUNDEF(MVT::v8f64);
20334 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20335 DAG.getIntPtrConstant(0, dl));
20336 }
20337 SDValue Res, Chain;
20338 if (IsStrict) {
20339 Res =
20340 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
20341 Chain = Res.getValue(1);
20342 } else {
20343 Res = DAG.getNode(Opc, dl, ResVT, Src);
20344 }
20345
20346 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20347 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20348 DAG.getIntPtrConstant(0, dl));
20349 if (IsStrict)
20350 return DAG.getMergeValues({Res, Chain}, dl);
20351 return Res;
20352 }
20353
20354 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20355 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20356 assert(!IsSigned && "Expected unsigned conversion!");
20357 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20358 return Op;
20359 }
20360
20361 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20362 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20363 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
20364 assert(!IsSigned && "Expected unsigned conversion!");
20365 assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
20366 "Unexpected features!");
20367 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20368 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20369 // Need to concat with zero vector for strict fp to avoid spurious
20370 // exceptions.
20371 // TODO: Should we just do this for non-strict as well?
20372 SDValue Tmp =
20373 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20374 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20375 DAG.getIntPtrConstant(0, dl));
20376
20377 SDValue Res, Chain;
20378 if (IsStrict) {
20379 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20380 {Op->getOperand(0), Src});
20381 Chain = Res.getValue(1);
20382 } else {
20383 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20384 }
20385
20386 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20387 DAG.getIntPtrConstant(0, dl));
20388
20389 if (IsStrict)
20390 return DAG.getMergeValues({Res, Chain}, dl);
20391 return Res;
20392 }
20393
20394 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20395 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20396 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
20397 assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
20398 !Subtarget.hasVLX() && "Unexpected features!");
20399 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20400 // Need to concat with zero vector for strict fp to avoid spurious
20401 // exceptions.
20402 // TODO: Should we just do this for non-strict as well?
20403 SDValue Tmp =
20404 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20405 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20406 DAG.getIntPtrConstant(0, dl));
20407
20408 SDValue Res, Chain;
20409 if (IsStrict) {
20410 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
20411 {Op->getOperand(0), Src});
20412 Chain = Res.getValue(1);
20413 } else {
20414 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
20415 }
20416
20417 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20418 DAG.getIntPtrConstant(0, dl));
20419
20420 if (IsStrict)
20421 return DAG.getMergeValues({Res, Chain}, dl);
20422 return Res;
20423 }
20424
20425 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
20426 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
20427 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
20428 DAG.getUNDEF(MVT::v2f32));
20429 if (IsStrict) {
20430 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
20431 : X86ISD::STRICT_CVTTP2UI;
20432 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
20433 }
20434 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20435 return DAG.getNode(Opc, dl, VT, Tmp);
20436 }
20437
20438 return SDValue();
20439 }
20440
20441 assert(!VT.isVector());
20442
20443 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
20444
20445 if (!IsSigned && UseSSEReg) {
20446 // Conversions from f32/f64 with AVX512 should be legal.
20447 if (Subtarget.hasAVX512())
20448 return Op;
20449
20450 // Use default expansion for i64.
20451 if (VT == MVT::i64)
20452 return SDValue();
20453
20454 assert(VT == MVT::i32 && "Unexpected VT!");
20455
20456 // Promote i32 to i64 and use a signed operation on 64-bit targets.
20457 // FIXME: This does not generate an invalid exception if the input does not
20458 // fit in i32. PR44019
20459 if (Subtarget.is64Bit()) {
20460 SDValue Res, Chain;
20461 if (IsStrict) {
20462 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
20463 { Op.getOperand(0), Src });
20464 Chain = Res.getValue(1);
20465 } else
20466 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
20467
20468 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20469 if (IsStrict)
20470 return DAG.getMergeValues({ Res, Chain }, dl);
20471 return Res;
20472 }
20473
20474 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
20475 // use fisttp which will be handled later.
20476 if (!Subtarget.hasSSE3())
20477 return SDValue();
20478 }
20479
20480 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
20481 // FIXME: This does not generate an invalid exception if the input does not
20482 // fit in i16. PR44019
20483 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
20484 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
20485 SDValue Res, Chain;
20486 if (IsStrict) {
20487 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
20488 { Op.getOperand(0), Src });
20489 Chain = Res.getValue(1);
20490 } else
20491 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
20492
20493 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20494 if (IsStrict)
20495 return DAG.getMergeValues({ Res, Chain }, dl);
20496 return Res;
20497 }
20498
20499 // If this is a FP_TO_SINT using SSEReg we're done.
20500 if (UseSSEReg && IsSigned)
20501 return Op;
20502
20503 // fp128 needs to use a libcall.
20504 if (SrcVT == MVT::f128) {
20505 RTLIB::Libcall LC;
20506 if (IsSigned)
20507 LC = RTLIB::getFPTOSINT(SrcVT, VT);
20508 else
20509 LC = RTLIB::getFPTOUINT(SrcVT, VT);
20510
20511 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20512 MakeLibCallOptions CallOptions;
20513 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
20514 SDLoc(Op), Chain);
20515
20516 if (IsStrict)
20517 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
20518
20519 return Tmp.first;
20520 }
20521
20522 // Fall back to X87.
20523 SDValue Chain;
20524 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
20525 if (IsStrict)
20526 return DAG.getMergeValues({V, Chain}, dl);
20527 return V;
20528 }
20529
20530 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
20531}
20532
20533SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
20534 SelectionDAG &DAG) const {
20535 SDValue Src = Op.getOperand(0);
20536 MVT SrcVT = Src.getSimpleValueType();
20537
20538 // If the source is in an SSE register, the node is Legal.
20539 if (isScalarFPTypeInSSEReg(SrcVT))
20540 return Op;
20541
20542 return LRINT_LLRINTHelper(Op.getNode(), DAG);
20543}
20544
20545SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
20546 SelectionDAG &DAG) const {
20547 EVT DstVT = N->getValueType(0);
20548 SDValue Src = N->getOperand(0);
20549 EVT SrcVT = Src.getValueType();
20550
20551 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
20552 // f16 must be promoted before using the lowering in this routine.
20553 // fp128 does not use this lowering.
20554 return SDValue();
20555 }
20556
20557 SDLoc DL(N);
20558 SDValue Chain = DAG.getEntryNode();
20559
20560 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
20561
20562 // If we're converting from SSE, the stack slot needs to hold both types.
20563 // Otherwise it only needs to hold the DstVT.
20564 EVT OtherVT = UseSSE ? SrcVT : DstVT;
20565 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
20566 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
20567 MachinePointerInfo MPI =
20568 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
20569
20570 if (UseSSE) {
20571 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
20572 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
20573 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20574 SDValue Ops[] = { Chain, StackPtr };
20575
20576 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
20577 /*Align*/0, MachineMemOperand::MOLoad);
20578 Chain = Src.getValue(1);
20579 }
20580
20581 SDValue StoreOps[] = { Chain, Src, StackPtr };
20582 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL,
20583 DAG.getVTList(MVT::Other), StoreOps,
20584 DstVT, MPI, /*Align*/0,
20585 MachineMemOperand::MOStore);
20586
20587 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
20588}
20589
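LRINT_LLRINTHelper above spills through a stack slot and lets an x87 FIST store do the rounding, so the result follows the current floating-point rounding mode; that is the same contract std::lrint/std::llrint expose. A tiny standard C++ sketch of that behavior (strictly, #pragma STDC FENV_ACCESS ON is required for the rounding-mode change to be guaranteed observable):

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::fesetround(FE_TONEAREST);                 // round to nearest, ties to even
  std::printf("%lld %lld\n", std::llrint(2.5), std::llrint(3.5));  // prints "2 4"
  std::fesetround(FE_TOWARDZERO);
  std::printf("%lld\n", std::llrint(2.9));       // prints "2"
  return 0;
}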
20590SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20591 bool IsStrict = Op->isStrictFPOpcode();
20592
20593 SDLoc DL(Op);
20594 MVT VT = Op.getSimpleValueType();
20595 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
20596 MVT SVT = In.getSimpleValueType();
20597
20598 if (VT == MVT::f128) {
20599 RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
20600 return LowerF128Call(Op, DAG, LC);
20601 }
20602
20603 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
20604
20605 SDValue Res =
20606 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
20607 if (IsStrict)
20608 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
20609 {Op->getOperand(0), Res});
20610 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
20611}
20612
20613SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20614 bool IsStrict = Op->isStrictFPOpcode();
20615
20616 MVT VT = Op.getSimpleValueType();
20617 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
20618 MVT SVT = In.getSimpleValueType();
20619
20620 // It's legal except when f128 is involved
20621 if (SVT != MVT::f128)
20622 return Op;
20623
20624 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
20625
20626 // FP_ROUND node has a second operand indicating whether it is known to be
20627 // precise. That doesn't take part in the LibCall so we can't directly use
20628 // LowerF128Call.
20629
20630 SDLoc dl(Op);
20631 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20632 MakeLibCallOptions CallOptions;
20633 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
20634 dl, Chain);
20635
20636 if (IsStrict)
20637 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
20638
20639 return Tmp.first;
20640}
20641
20642static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
20643 bool IsStrict = Op->isStrictFPOpcode();
20644 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20645 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
20646 "Unexpected VT!");
20647
20648 SDLoc dl(Op);
20649 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
20650 DAG.getConstant(0, dl, MVT::v8i16), Src,
20651 DAG.getIntPtrConstant(0, dl));
20652
20653 SDValue Chain;
20654 if (IsStrict) {
20655 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
20656 {Op.getOperand(0), Res});
20657 Chain = Res.getValue(1);
20658 } else {
20659 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
20660 }
20661
20662 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
20663 DAG.getIntPtrConstant(0, dl));
20664
20665 if (IsStrict)
20666 return DAG.getMergeValues({Res, Chain}, dl);
20667
20668 return Res;
20669}
20670
20671static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
20672 bool IsStrict = Op->isStrictFPOpcode();
20673 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20674  assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
20675         "Unexpected VT!");
20676
20677 SDLoc dl(Op);
20678 SDValue Res, Chain;
20679 if (IsStrict) {
20680 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
20681 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
20682 DAG.getIntPtrConstant(0, dl));
20683 Res = DAG.getNode(
20684 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
20685 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
20686 Chain = Res.getValue(1);
20687 } else {
20688 // FIXME: Should we use zeros for upper elements for non-strict?
20689 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
20690 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
20691 DAG.getTargetConstant(4, dl, MVT::i32));
20692 }
20693
20694 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
20695 DAG.getIntPtrConstant(0, dl));
20696
20697 if (IsStrict)
20698 return DAG.getMergeValues({Res, Chain}, dl);
20699
20700 return Res;
20701}
20702
20703/// Depending on uarch and/or optimizing for size, we might prefer to use a
20704/// vector operation in place of the typical scalar operation.
20705static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
20706 const X86Subtarget &Subtarget) {
20707 // If both operands have other uses, this is probably not profitable.
20708 SDValue LHS = Op.getOperand(0);
20709 SDValue RHS = Op.getOperand(1);
20710 if (!LHS.hasOneUse() && !RHS.hasOneUse())
20711 return Op;
20712
20713 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
20714 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
20715 if (IsFP && !Subtarget.hasSSE3())
20716 return Op;
20717 if (!IsFP && !Subtarget.hasSSSE3())
20718 return Op;
20719
20720 // Extract from a common vector.
20721 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20722 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20723 LHS.getOperand(0) != RHS.getOperand(0) ||
20724 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
20725 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
20726 !shouldUseHorizontalOp(true, DAG, Subtarget))
20727 return Op;
20728
20729 // Allow commuted 'hadd' ops.
20730 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
20731 unsigned HOpcode;
20732 switch (Op.getOpcode()) {
20733 case ISD::ADD: HOpcode = X86ISD::HADD; break;
20734 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
20735 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
20736 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
20737 default:
20738    llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
20739 }
20740 unsigned LExtIndex = LHS.getConstantOperandVal(1);
20741 unsigned RExtIndex = RHS.getConstantOperandVal(1);
20742 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
20743 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
20744 std::swap(LExtIndex, RExtIndex);
20745
20746 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
20747 return Op;
20748
20749 SDValue X = LHS.getOperand(0);
20750 EVT VecVT = X.getValueType();
20751 unsigned BitWidth = VecVT.getSizeInBits();
20752 unsigned NumLanes = BitWidth / 128;
20753 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
20754  assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
20755         "Not expecting illegal vector widths here");
20756
20757 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
20758 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
20759 SDLoc DL(Op);
20760 if (BitWidth == 256 || BitWidth == 512) {
20761 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
20762 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
20763 LExtIndex %= NumEltsPerLane;
20764 }
20765
20766 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
20767 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
20768 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
20769 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
20770 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
20771 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
20772 DAG.getIntPtrConstant(LExtIndex / 2, DL));
20773}
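
A minimal standalone sketch of the rewrite above (illustrative only, not part of the original source): haddSelf models SSE3 HADDPS of a 4 x float register with itself, which is why one horizontal op plus a single extract can replace the scalar add of two extracted lanes.

    #include <array>
    #include <cassert>

    // Models (f)hadd X, X on one 128-bit lane of four floats:
    // result[0] = x[0] + x[1], result[1] = x[2] + x[3], and the same pairs
    // repeat in the upper half because both source operands are X.
    static std::array<float, 4> haddSelf(const std::array<float, 4> &x) {
      return {x[0] + x[1], x[2] + x[3], x[0] + x[1], x[2] + x[3]};
    }

    int main() {
      std::array<float, 4> x{1.0f, 2.0f, 3.0f, 4.0f};
      // add (extractelt X, 0), (extractelt X, 1) --> extractelt (hadd X, X), 0
      assert(haddSelf(x)[0] == x[0] + x[1]);
      // add (extractelt X, 2), (extractelt X, 3) --> extractelt (hadd X, X), 1
      assert(haddSelf(x)[1] == x[2] + x[3]);
      return 0;
    }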
20774
20775/// Depending on uarch and/or optimizing for size, we might prefer to use a
20776/// vector operation in place of the typical scalar operation.
20777SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
20778  assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
20779         "Only expecting float/double");
20780 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
20781}
20782
20783/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
20784/// This mode isn't supported in hardware on X86. But as long as we aren't
20785/// compiling with trapping math, we can emulate this with
20786/// floor(X + copysign(nextafter(0.5, 0.0), X)).
20787static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
20788 SDValue N0 = Op.getOperand(0);
20789 SDLoc dl(Op);
20790 MVT VT = Op.getSimpleValueType();
20791
20792 // N0 += copysign(nextafter(0.5, 0.0), N0)
20793 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
20794 bool Ignored;
20795 APFloat Point5Pred = APFloat(0.5f);
20796 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
20797 Point5Pred.next(/*nextDown*/true);
20798
20799 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
20800 DAG.getConstantFP(Point5Pred, dl, VT), N0);
20801 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
20802
20803 // Truncate the result to remove fraction.
20804 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
20805}
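
A standalone plain-C++ sketch of the emulation described above (illustrative only; std::trunc stands in for the FTRUNC node). It also shows why the predecessor of 0.5 is used rather than 0.5 itself.

    #include <cassert>
    #include <cmath>

    static double roundTiesAwayFromZero(double x) {
      // copysign(nextafter(0.5, 0.0), x): the largest double below 0.5,
      // carrying the sign of x.
      double bias = std::copysign(std::nextafter(0.5, 0.0), x);
      return std::trunc(x + bias);
    }

    int main() {
      assert(roundTiesAwayFromZero(2.5) == 3.0);    // tie rounds away from zero
      assert(roundTiesAwayFromZero(-2.5) == -3.0);
      // Adding exactly 0.5 would round the largest double below 0.5 up to 1.0
      // (its sum with 0.5 rounds to 1.0 under round-to-nearest-even); using the
      // predecessor of 0.5 keeps the sum below 1.0, so the result stays 0.
      assert(roundTiesAwayFromZero(std::nextafter(0.5, 0.0)) == 0.0);
      return 0;
    }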
20806
20807/// The only differences between FABS and FNEG are the mask and the logic op.
20808/// FNEG also has a folding opportunity for FNEG(FABS(x)).
20809static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
20810  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
20811         "Wrong opcode for lowering FABS or FNEG.");
20812
20813 bool IsFABS = (Op.getOpcode() == ISD::FABS);
20814
20815 // If this is a FABS and it has an FNEG user, bail out to fold the combination
20816 // into an FNABS. We'll lower the FABS after that if it is still in use.
20817 if (IsFABS)
20818 for (SDNode *User : Op->uses())
20819 if (User->getOpcode() == ISD::FNEG)
20820 return Op;
20821
20822 SDLoc dl(Op);
20823 MVT VT = Op.getSimpleValueType();
20824
20825 bool IsF128 = (VT == MVT::f128);
20826  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
20827          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
20828          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
20829         "Unexpected type in LowerFABSorFNEG");
20830
20831 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
20832 // decide if we should generate a 16-byte constant mask when we only need 4 or
20833 // 8 bytes for the scalar case.
20834
20835 // There are no scalar bitwise logical SSE/AVX instructions, so we
20836 // generate a 16-byte vector constant and logic op even for the scalar case.
20837 // Using a 16-byte mask allows folding the load of the mask with
20838 // the logic op, so it can save (~4 bytes) on code size.
20839 bool IsFakeVector = !VT.isVector() && !IsF128;
20840 MVT LogicVT = VT;
20841 if (IsFakeVector)
20842 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
20843
20844 unsigned EltBits = VT.getScalarSizeInBits();
20845 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
20846 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
20847 APInt::getSignMask(EltBits);
20848 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
20849 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
20850
20851 SDValue Op0 = Op.getOperand(0);
20852 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
20853 unsigned LogicOp = IsFABS ? X86ISD::FAND :
20854 IsFNABS ? X86ISD::FOR :
20855 X86ISD::FXOR;
20856 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
20857
20858 if (VT.isVector() || IsF128)
20859 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
20860
20861 // For the scalar case extend to a 128-bit vector, perform the logic op,
20862 // and extract the scalar result back out.
20863 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
20864 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
20865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
20866 DAG.getIntPtrConstant(0, dl));
20867}
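
A scalar sketch of the two masks above (illustrative only; bit-twiddling on a single float rather than a 16-byte vector): FABS clears the sign bit with the signed-max mask, FNEG flips it with the sign mask.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static float fabsViaMask(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits &= 0x7fffffffu;            // APInt::getSignedMaxValue(32)
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }

    static float fnegViaMask(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      bits ^= 0x80000000u;            // APInt::getSignMask(32)
      std::memcpy(&x, &bits, sizeof(bits));
      return x;
    }

    int main() {
      assert(fabsViaMask(-2.5f) == 2.5f);
      assert(fnegViaMask(2.5f) == -2.5f);
      assert(fnegViaMask(fabsViaMask(-2.5f)) == -2.5f); // the FNABS pattern
      return 0;
    }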
20868
20869static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
20870 SDValue Mag = Op.getOperand(0);
20871 SDValue Sign = Op.getOperand(1);
20872 SDLoc dl(Op);
20873
20874 // If the sign operand is smaller, extend it first.
20875 MVT VT = Op.getSimpleValueType();
20876 if (Sign.getSimpleValueType().bitsLT(VT))
20877 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
20878
20879 // And if it is bigger, shrink it first.
20880 if (Sign.getSimpleValueType().bitsGT(VT))
20881 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
20882
20883 // At this point the operands and the result should have the same
20884 // type, and that won't be f80 since that is not custom lowered.
20885 bool IsF128 = (VT == MVT::f128);
20886  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
20887          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
20888          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
20889         "Unexpected type in LowerFCOPYSIGN");
20890
20891 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
20892
20893 // Perform all scalar logic operations as 16-byte vectors because there are no
20894 // scalar FP logic instructions in SSE.
20895 // TODO: This isn't necessary. If we used scalar types, we might avoid some
20896 // unnecessary splats, but we might miss load folding opportunities. Should
20897 // this decision be based on OptimizeForSize?
20898 bool IsFakeVector = !VT.isVector() && !IsF128;
20899 MVT LogicVT = VT;
20900 if (IsFakeVector)
20901 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
20902
20903 // The mask constants are automatically splatted for vector types.
20904 unsigned EltSizeInBits = VT.getScalarSizeInBits();
20905 SDValue SignMask = DAG.getConstantFP(
20906 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
20907 SDValue MagMask = DAG.getConstantFP(
20908 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
20909
20910 // First, clear all bits but the sign bit from the second operand (sign).
20911 if (IsFakeVector)
20912 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
20913 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
20914
20915 // Next, clear the sign bit from the first operand (magnitude).
20916 // TODO: If we had general constant folding for FP logic ops, this check
20917 // wouldn't be necessary.
20918 SDValue MagBits;
20919 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
20920 APFloat APF = Op0CN->getValueAPF();
20921 APF.clearSign();
20922 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
20923 } else {
20924 // If the magnitude operand wasn't a constant, we need to AND out the sign.
20925 if (IsFakeVector)
20926 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
20927 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
20928 }
20929
20930 // OR the magnitude value with the sign bit.
20931 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
20932 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
20933 DAG.getIntPtrConstant(0, dl));
20934}
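
The same masking idea for copysign on a scalar float (illustrative only): keep the magnitude bits of the first operand, keep only the sign bit of the second, and OR them together, mirroring the FAND/FAND/FOR sequence above.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    static float copysignViaMasks(float mag, float sign) {
      uint32_t m, s;
      std::memcpy(&m, &mag, sizeof(m));
      std::memcpy(&s, &sign, sizeof(s));
      uint32_t bits = (m & 0x7fffffffu)   // MagMask: clear the sign bit
                    | (s & 0x80000000u);  // SignMask: keep only the sign bit
      float r;
      std::memcpy(&r, &bits, sizeof(r));
      return r;
    }

    int main() {
      assert(copysignViaMasks(3.0f, -1.0f) == -3.0f);
      assert(copysignViaMasks(-3.0f, 1.0f) == 3.0f);
      return 0;
    }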
20935
20936static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
20937 SDValue N0 = Op.getOperand(0);
20938 SDLoc dl(Op);
20939 MVT VT = Op.getSimpleValueType();
20940
20941 MVT OpVT = N0.getSimpleValueType();
20942  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
20943         "Unexpected type for FGETSIGN");
20944
20945 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
20946 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
20947 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
20948 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
20949 Res = DAG.getZExtOrTrunc(Res, dl, VT);
20950 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
20951 return Res;
20952}
20953
20954/// Helper for creating a X86ISD::SETCC node.
20955static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
20956 SelectionDAG &DAG) {
20957 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
20958 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
20959}
20960
20961/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
20962/// style scalarized (associative) reduction patterns.
20963static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
20964 SmallVectorImpl<SDValue> &SrcOps) {
20965 SmallVector<SDValue, 8> Opnds;
20966 DenseMap<SDValue, APInt> SrcOpMap;
20967 EVT VT = MVT::Other;
20968
20969  // Recognize a special case where a vector is cast into a wide integer to
20970  // test for all zeros.
20971  assert(Op.getOpcode() == unsigned(BinOp) &&
20972         "Unexpected bit reduction opcode");
20973 Opnds.push_back(Op.getOperand(0));
20974 Opnds.push_back(Op.getOperand(1));
20975
20976 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
20977 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
20978 // BFS traverse all BinOp operands.
20979 if (I->getOpcode() == unsigned(BinOp)) {
20980 Opnds.push_back(I->getOperand(0));
20981 Opnds.push_back(I->getOperand(1));
20982 // Re-evaluate the number of nodes to be traversed.
20983 e += 2; // 2 more nodes (LHS and RHS) are pushed.
20984 continue;
20985 }
20986
20987    // Quit if this is not an EXTRACT_VECTOR_ELT.
20988 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20989 return false;
20990
20991    // Quit if the index is not a constant.
20992 SDValue Idx = I->getOperand(1);
20993 if (!isa<ConstantSDNode>(Idx))
20994 return false;
20995
20996 SDValue Src = I->getOperand(0);
20997 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
20998 if (M == SrcOpMap.end()) {
20999 VT = Src.getValueType();
21000 // Quit if not the same type.
21001 if (SrcOpMap.begin() != SrcOpMap.end() &&
21002 VT != SrcOpMap.begin()->first.getValueType())
21003 return false;
21004 unsigned NumElts = VT.getVectorNumElements();
21005 APInt EltCount = APInt::getNullValue(NumElts);
21006 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
21007 SrcOps.push_back(Src);
21008 }
21009 // Quit if element already used.
21010 unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
21011 if (M->second[CIdx])
21012 return false;
21013 M->second.setBit(CIdx);
21014 }
21015
21016 // Quit if not all elements are used.
21017 for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
21018 E = SrcOpMap.end();
21019 I != E; ++I) {
21020 if (!I->second.isAllOnesValue())
21021 return false;
21022 }
21023
21024 return true;
21025}
21026
21027// Check whether an OR'd tree is PTEST-able.
21028static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
21029 const X86Subtarget &Subtarget,
21030 SelectionDAG &DAG, SDValue &X86CC) {
21031  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
21032
21033 if (!Subtarget.hasSSE41() || !Op->hasOneUse())
21034 return SDValue();
21035
21036 SmallVector<SDValue, 8> VecIns;
21037 if (!matchScalarReduction(Op, ISD::OR, VecIns))
21038 return SDValue();
21039
21040 // Quit if not 128/256-bit vector.
21041 EVT VT = VecIns[0].getValueType();
21042 if (!VT.is128BitVector() && !VT.is256BitVector())
21043 return SDValue();
21044
21045 SDLoc DL(Op);
21046 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
21047
21048 // Cast all vectors into TestVT for PTEST.
21049 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
21050 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
21051
21052 // If more than one full vector is evaluated, OR them first before PTEST.
21053 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
21054 // Each iteration will OR 2 nodes and append the result until there is only
21055 // 1 node left, i.e. the final OR'd value of all vectors.
21056 SDValue LHS = VecIns[Slot];
21057 SDValue RHS = VecIns[Slot + 1];
21058 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
21059 }
21060
21061 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
21062 DL, MVT::i8);
21063 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
21064}
21065
21066/// Return true if \c Op has a use that doesn't just read flags.
21067static bool hasNonFlagsUse(SDValue Op) {
21068 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
21069 ++UI) {
21070 SDNode *User = *UI;
21071 unsigned UOpNo = UI.getOperandNo();
21072 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
21073      // Look past the truncate.
21074 UOpNo = User->use_begin().getOperandNo();
21075 User = *User->use_begin();
21076 }
21077
21078 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
21079 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
21080 return true;
21081 }
21082 return false;
21083}
21084
21085// Transform to an x86-specific ALU node with flags if there is a chance of
21086// using an RMW op or only the flags are used. Otherwise, leave
21087// the node alone and emit a 'cmp' or 'test' instruction.
21088static bool isProfitableToUseFlagOp(SDValue Op) {
21089 for (SDNode *U : Op->uses())
21090 if (U->getOpcode() != ISD::CopyToReg &&
21091 U->getOpcode() != ISD::SETCC &&
21092 U->getOpcode() != ISD::STORE)
21093 return false;
21094
21095 return true;
21096}
21097
21098/// Emit nodes that will be selected as "test Op0,Op0", or something
21099/// equivalent.
21100static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
21101 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
21102 // CF and OF aren't always set the way we want. Determine which
21103 // of these we need.
21104 bool NeedCF = false;
21105 bool NeedOF = false;
21106 switch (X86CC) {
21107 default: break;
21108 case X86::COND_A: case X86::COND_AE:
21109 case X86::COND_B: case X86::COND_BE:
21110 NeedCF = true;
21111 break;
21112 case X86::COND_G: case X86::COND_GE:
21113 case X86::COND_L: case X86::COND_LE:
21114 case X86::COND_O: case X86::COND_NO: {
21115 // Check if we really need to set the
21116 // Overflow flag. If NoSignedWrap is present
21117 // that is not actually needed.
21118 switch (Op->getOpcode()) {
21119 case ISD::ADD:
21120 case ISD::SUB:
21121 case ISD::MUL:
21122 case ISD::SHL:
21123 if (Op.getNode()->getFlags().hasNoSignedWrap())
21124 break;
21125      LLVM_FALLTHROUGH;
21126 default:
21127 NeedOF = true;
21128 break;
21129 }
21130 break;
21131 }
21132 }
21133 // See if we can use the EFLAGS value from the operand instead of
21134 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
21135 // we prove that the arithmetic won't overflow, we can't use OF or CF.
21136 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
21137 // Emit a CMP with 0, which is the TEST pattern.
21138 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
21139 DAG.getConstant(0, dl, Op.getValueType()));
21140 }
21141 unsigned Opcode = 0;
21142 unsigned NumOperands = 0;
21143
21144 SDValue ArithOp = Op;
21145
21146 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
21147 // which may be the result of a CAST. We use the variable 'Op', which is the
21148 // non-casted variable when we check for possible users.
21149 switch (ArithOp.getOpcode()) {
21150 case ISD::AND:
21151 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
21152 // because a TEST instruction will be better.
21153 if (!hasNonFlagsUse(Op))
21154 break;
21155
21156    LLVM_FALLTHROUGH;
21157 case ISD::ADD:
21158 case ISD::SUB:
21159 case ISD::OR:
21160 case ISD::XOR:
21161 if (!isProfitableToUseFlagOp(Op))
21162 break;
21163
21164 // Otherwise use a regular EFLAGS-setting instruction.
21165 switch (ArithOp.getOpcode()) {
21166    default: llvm_unreachable("unexpected operator!");
21167 case ISD::ADD: Opcode = X86ISD::ADD; break;
21168 case ISD::SUB: Opcode = X86ISD::SUB; break;
21169 case ISD::XOR: Opcode = X86ISD::XOR; break;
21170 case ISD::AND: Opcode = X86ISD::AND; break;
21171 case ISD::OR: Opcode = X86ISD::OR; break;
21172 }
21173
21174 NumOperands = 2;
21175 break;
21176 case X86ISD::ADD:
21177 case X86ISD::SUB:
21178 case X86ISD::OR:
21179 case X86ISD::XOR:
21180 case X86ISD::AND:
21181 return SDValue(Op.getNode(), 1);
21182 case ISD::SSUBO:
21183 case ISD::USUBO: {
21184    // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
21185 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21186 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
21187 Op->getOperand(1)).getValue(1);
21188 }
21189 default:
21190 break;
21191 }
21192
21193 if (Opcode == 0) {
21194 // Emit a CMP with 0, which is the TEST pattern.
21195 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
21196 DAG.getConstant(0, dl, Op.getValueType()));
21197 }
21198 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
21199 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
21200
21201 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
21202 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
21203 return SDValue(New.getNode(), 1);
21204}
21205
21206/// Emit nodes that will be selected as "cmp Op0,Op1", or something
21207/// equivalent.
21208static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
21209 const SDLoc &dl, SelectionDAG &DAG,
21210 const X86Subtarget &Subtarget) {
21211 if (isNullConstant(Op1))
21212 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
21213
21214 EVT CmpVT = Op0.getValueType();
21215
21216  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
21217          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
21218
21219  // Only promote the compare up to i32 if it is a 16-bit operation
21220  // with an immediate. 16-bit immediates are to be avoided.
21221 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
21222 !DAG.getMachineFunction().getFunction().hasMinSize()) {
21223 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
21224 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
21225 // Don't do this if the immediate can fit in 8-bits.
21226 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
21227 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
21228 unsigned ExtendOp =
21229 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21230 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
21231 // For equality comparisons try to use SIGN_EXTEND if the input was
21232        // truncated from something with enough sign bits.
21233 if (Op0.getOpcode() == ISD::TRUNCATE) {
21234 SDValue In = Op0.getOperand(0);
21235 unsigned EffBits =
21236 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
21237 if (EffBits <= 16)
21238 ExtendOp = ISD::SIGN_EXTEND;
21239 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
21240 SDValue In = Op1.getOperand(0);
21241 unsigned EffBits =
21242 In.getScalarValueSizeInBits() - DAG.ComputeNumSignBits(In) + 1;
21243 if (EffBits <= 16)
21244 ExtendOp = ISD::SIGN_EXTEND;
21245 }
21246 }
21247
21248 CmpVT = MVT::i32;
21249 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
21250 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
21251 }
21252 }
21253
21254 // Try to shrink i64 compares if the input has enough zero bits.
21255 // FIXME: Do this for non-constant compares for constant on LHS?
21256 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
21257 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
21258 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
21259 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
21260 CmpVT = MVT::i32;
21261 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
21262 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
21263 }
21264
21265 // 0-x == y --> x+y == 0
21266 // 0-x != y --> x+y != 0
21267 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
21268 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
21269 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
21270 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
21271 return Add.getValue(1);
21272 }
21273
21274 // x == 0-y --> x+y == 0
21275 // x != 0-y --> x+y != 0
21276 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
21277 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
21278 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
21279 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
21280 return Add.getValue(1);
21281 }
21282
21283 // Use SUB instead of CMP to enable CSE between SUB and CMP.
21284 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
21285 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
21286 return Sub.getValue(1);
21287}
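
A quick check of the two negation rewrites above (illustrative only): with two's-complement wraparound, 0-x == y holds exactly when x+y == 0, which is what lets the ADD set EFLAGS directly instead of materializing the negation plus a separate CMP.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t samples[] = {0u, 1u, 7u, 0x80000000u, 0xfffffff9u, 0xffffffffu};
      for (uint32_t x : samples)
        for (uint32_t y : samples) {
          // 0-x == y  <=>  x+y == 0   (and likewise for !=)
          assert(((0u - x) == y) == ((x + y) == 0u));
        }
      return 0;
    }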
21288
21289/// Check if replacement of SQRT with RSQRT should be disabled.
21290bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
21291 EVT VT = Op.getValueType();
21292
21293 // We never want to use both SQRT and RSQRT instructions for the same input.
21294 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
21295 return false;
21296
21297 if (VT.isVector())
21298 return Subtarget.hasFastVectorFSQRT();
21299 return Subtarget.hasFastScalarFSQRT();
21300}
21301
21302/// The minimum architected relative accuracy is 2^-12. We need one
21303/// Newton-Raphson step to have a good float result (24 bits of precision).
21304SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
21305 SelectionDAG &DAG, int Enabled,
21306 int &RefinementSteps,
21307 bool &UseOneConstNR,
21308 bool Reciprocal) const {
21309 EVT VT = Op.getValueType();
21310
21311 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
21312 // It is likely not profitable to do this for f64 because a double-precision
21313 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
21314 // instructions: convert to single, rsqrtss, convert back to double, refine
21315 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
21316 // along with FMA, this could be a throughput win.
21317 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
21318 // after legalize types.
21319 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
21320 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
21321 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
21322 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
21323 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
21324 if (RefinementSteps == ReciprocalEstimate::Unspecified)
21325 RefinementSteps = 1;
21326
21327 UseOneConstNR = false;
21328    // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
21329 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
21330 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
21331 }
21332 return SDValue();
21333}
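
A sketch of the single Newton-Raphson step mentioned above (illustrative only; roughRsqrt is a placeholder for the roughly 12-bit hardware estimate, not a real intrinsic): x1 = x0 * (1.5 - 0.5 * a * x0 * x0) roughly doubles the number of correct bits, enough for a float result.

    #include <cassert>
    #include <cmath>

    static float roughRsqrt(float a) {
      // Placeholder for RSQRTSS/RSQRT14: about 2^-12 relative error.
      return (1.0f / std::sqrt(a)) * (1.0f + 1.0e-4f);
    }

    static float refinedRsqrt(float a) {
      float x0 = roughRsqrt(a);
      return x0 * (1.5f - 0.5f * a * x0 * x0); // one Newton-Raphson step
    }

    int main() {
      float a = 2.0f;
      float exact = 1.0f / std::sqrt(a);
      assert(std::fabs(refinedRsqrt(a) - exact) <
             std::fabs(roughRsqrt(a) - exact));
      return 0;
    }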
21334
21335/// The minimum architected relative accuracy is 2^-12. We need one
21336/// Newton-Raphson step to have a good float result (24 bits of precision).
21337SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
21338 int Enabled,
21339 int &RefinementSteps) const {
21340 EVT VT = Op.getValueType();
21341
21342 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
21343 // It is likely not profitable to do this for f64 because a double-precision
21344 // reciprocal estimate with refinement on x86 prior to FMA requires
21345 // 15 instructions: convert to single, rcpss, convert back to double, refine
21346 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
21347 // along with FMA, this could be a throughput win.
21348
21349 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
21350 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
21351 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
21352 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
21353 // Enable estimate codegen with 1 refinement step for vector division.
21354 // Scalar division estimates are disabled because they break too much
21355 // real-world code. These defaults are intended to match GCC behavior.
21356 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
21357 return SDValue();
21358
21359 if (RefinementSteps == ReciprocalEstimate::Unspecified)
21360 RefinementSteps = 1;
21361
21362    // There is no FRCP for 512-bit vectors, but there is RCP14.
21363 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
21364 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
21365 }
21366 return SDValue();
21367}
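
The analogous single Newton-Raphson step for the reciprocal estimate (illustrative only; roughRcp is a placeholder for RCPSS/RCP14-style accuracy): x1 = x0 * (2 - a * x0).

    #include <cassert>
    #include <cmath>

    static float roughRcp(float a) {
      return (1.0f / a) * (1.0f + 1.0e-4f); // ~12-bit placeholder estimate
    }

    static float refinedRcp(float a) {
      float x0 = roughRcp(a);
      return x0 * (2.0f - a * x0); // one Newton-Raphson step
    }

    int main() {
      float a = 3.0f;
      assert(std::fabs(refinedRcp(a) - 1.0f / a) <
             std::fabs(roughRcp(a) - 1.0f / a));
      return 0;
    }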
21368
21369/// If we have at least two divisions that use the same divisor, convert to
21370/// multiplication by a reciprocal. This may need to be adjusted for a given
21371/// CPU if a division's cost is not at least twice the cost of a multiplication.
21372/// This is because we still need one division to calculate the reciprocal and
21373/// then we need two multiplies by that reciprocal as replacements for the
21374/// original divisions.
21375unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
21376 return 2;
21377}
21378
21379SDValue
21380X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
21381 SelectionDAG &DAG,
21382 SmallVectorImpl<SDNode *> &Created) const {
21383 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
21384 if (isIntDivCheap(N->getValueType(0), Attr))
21385 return SDValue(N,0); // Lower SDIV as SDIV
21386
21387  assert((Divisor.isPowerOf2() || (-Divisor).isPowerOf2()) &&
21388         "Unexpected divisor!");
21389
21390  // Only perform this transform if CMOV is supported; otherwise the select
21391 // below will become a branch.
21392 if (!Subtarget.hasCMov())
21393 return SDValue();
21394
21395 // fold (sdiv X, pow2)
21396 EVT VT = N->getValueType(0);
21397 // FIXME: Support i8.
21398 if (VT != MVT::i16 && VT != MVT::i32 &&
21399 !(Subtarget.is64Bit() && VT == MVT::i64))
21400 return SDValue();
21401
21402 unsigned Lg2 = Divisor.countTrailingZeros();
21403
21404 // If the divisor is 2 or -2, the default expansion is better.
21405 if (Lg2 == 1)
21406 return SDValue();
21407
21408 SDLoc DL(N);
21409 SDValue N0 = N->getOperand(0);
21410 SDValue Zero = DAG.getConstant(0, DL, VT);
21411 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
21412 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
21413
21414 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
21415 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
21416 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
21417 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
21418
21419 Created.push_back(Cmp.getNode());
21420 Created.push_back(Add.getNode());
21421 Created.push_back(CMov.getNode());
21422
21423 // Divide by pow2.
21424 SDValue SRA =
21425 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
21426
21427 // If we're dividing by a positive value, we're done. Otherwise, we must
21428 // negate the result.
21429 if (Divisor.isNonNegative())
21430 return SRA;
21431
21432 Created.push_back(SRA.getNode());
21433 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
21434}
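
A plain-C++ sketch of the transform above for a positive power-of-two divisor (illustrative only; the final negation for a negative divisor is omitted): add 2^k - 1 when the dividend is negative, then shift right arithmetically by k.

    #include <cassert>

    // Assumes >> on a negative int is an arithmetic shift (guaranteed since
    // C++20 and the behavior of all mainstream compilers before that).
    static int sdivPow2(int n, unsigned k) {         // divides by 1 << k
      int adjusted = n < 0 ? n + ((1 << k) - 1) : n; // the CMOV built above
      return adjusted >> k;                          // the SRA built above
    }

    int main() {
      assert(sdivPow2(-7, 2) == -7 / 4);   // -1: truncation toward zero
      assert(sdivPow2(7, 2) == 7 / 4);     //  1
      assert(sdivPow2(-8, 3) == -8 / 8);   // -1
      assert(sdivPow2(5, 3) == 5 / 8);     //  0
      return 0;
    }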
21435
21436/// Result of 'and' is compared against zero. Change to a BT node if possible.
21437/// Returns the BT node and the condition code needed to use it.
21438static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
21439 const SDLoc &dl, SelectionDAG &DAG,
21440 SDValue &X86CC) {
21441  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
21442 SDValue Op0 = And.getOperand(0);
21443 SDValue Op1 = And.getOperand(1);
21444 if (Op0.getOpcode() == ISD::TRUNCATE)
21445 Op0 = Op0.getOperand(0);
21446 if (Op1.getOpcode() == ISD::TRUNCATE)
21447 Op1 = Op1.getOperand(0);
21448
21449 SDValue Src, BitNo;
21450 if (Op1.getOpcode() == ISD::SHL)
21451 std::swap(Op0, Op1);
21452 if (Op0.getOpcode() == ISD::SHL) {
21453 if (isOneConstant(Op0.getOperand(0))) {
21454 // If we looked past a truncate, check that it's only truncating away
21455 // known zeros.
21456 unsigned BitWidth = Op0.getValueSizeInBits();
21457 unsigned AndBitWidth = And.getValueSizeInBits();
21458 if (BitWidth > AndBitWidth) {
21459 KnownBits Known = DAG.computeKnownBits(Op0);
21460 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
21461 return SDValue();
21462 }
21463 Src = Op1;
21464 BitNo = Op0.getOperand(1);
21465 }
21466 } else if (Op1.getOpcode() == ISD::Constant) {
21467 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
21468 uint64_t AndRHSVal = AndRHS->getZExtValue();
21469 SDValue AndLHS = Op0;
21470
21471 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
21472 Src = AndLHS.getOperand(0);
21473 BitNo = AndLHS.getOperand(1);
21474 } else {
21475 // Use BT if the immediate can't be encoded in a TEST instruction or we
21476      // are optimizing for size and the immediate won't fit in a byte.
21477 bool OptForSize = DAG.shouldOptForSize();
21478 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
21479 isPowerOf2_64(AndRHSVal)) {
21480 Src = AndLHS;
21481 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
21482 Src.getValueType());
21483 }
21484 }
21485 }
21486
21487 // No patterns found, give up.
21488 if (!Src.getNode())
21489 return SDValue();
21490
21491 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21492 // instruction. Since the shift amount is in-range-or-undefined, we know
21493 // that doing a bittest on the i32 value is ok. We extend to i32 because
21494 // the encoding for the i16 version is larger than the i32 version.
21495  // Also promote i16 to i32 for performance / code size reasons.
21496 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
21497 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
21498
21499 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21500 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21501 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21502 // known to be zero.
21503 if (Src.getValueType() == MVT::i64 &&
21504 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21505 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
21506
21507 // If the operand types disagree, extend the shift amount to match. Since
21508 // BT ignores high bits (like shifts) we can use anyextend.
21509 if (Src.getValueType() != BitNo.getValueType())
21510 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
21511
21512 X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B,
21513 dl, MVT::i8);
21514 return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
21515}
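
A scalar sketch of the patterns recognized above (illustrative only): both the shifted-one form (1 << n) & x and the single-bit-constant form x & mask test one bit, which is exactly what BT computes into the carry flag.

    #include <cassert>
    #include <cstdint>

    static bool bitTest(uint32_t x, unsigned n) {
      return (x >> n) & 1u; // the bit BT leaves in CF
    }

    int main() {
      uint32_t x = 0xAu; // binary 1010
      assert(bitTest(x, 1) == (((1u << 1) & x) != 0)); // SHL form, bit set
      assert(bitTest(x, 2) == (((1u << 2) & x) != 0)); // SHL form, bit clear
      assert(bitTest(x, 3) == ((x & 0x8u) != 0));      // power-of-two mask form
      return 0;
    }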
21516
21517/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
21518/// CMPs.
21519static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
21520 SDValue &Op1, bool &IsAlwaysSignaling) {
21521 unsigned SSECC;
21522 bool Swap = false;
21523
21524 // SSE Condition code mapping:
21525 // 0 - EQ
21526 // 1 - LT
21527 // 2 - LE
21528 // 3 - UNORD
21529 // 4 - NEQ
21530 // 5 - NLT
21531 // 6 - NLE
21532 // 7 - ORD
21533 switch (SetCCOpcode) {
21534  default: llvm_unreachable("Unexpected SETCC condition");
21535 case ISD::SETOEQ:
21536 case ISD::SETEQ: SSECC = 0; break;
21537 case ISD::SETOGT:
21538  case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
21539 case ISD::SETLT:
21540 case ISD::SETOLT: SSECC = 1; break;
21541 case ISD::SETOGE:
21542  case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
21543 case ISD::SETLE:
21544 case ISD::SETOLE: SSECC = 2; break;
21545 case ISD::SETUO: SSECC = 3; break;
21546 case ISD::SETUNE:
21547 case ISD::SETNE: SSECC = 4; break;
21548  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
21549 case ISD::SETUGE: SSECC = 5; break;
21550  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
21551 case ISD::SETUGT: SSECC = 6; break;
21552 case ISD::SETO: SSECC = 7; break;
21553 case ISD::SETUEQ: SSECC = 8; break;
21554 case ISD::SETONE: SSECC = 12; break;
21555 }
21556 if (Swap)
21557 std::swap(Op0, Op1);
21558
21559 switch (SetCCOpcode) {
21560 default:
21561 IsAlwaysSignaling = true;
21562 break;
21563 case ISD::SETEQ:
21564 case ISD::SETOEQ:
21565 case ISD::SETUEQ:
21566 case ISD::SETNE:
21567 case ISD::SETONE:
21568 case ISD::SETUNE:
21569 case ISD::SETO:
21570 case ISD::SETUO:
21571 IsAlwaysSignaling = false;
21572 break;
21573 }
21574
21575 return SSECC;
21576}
21577
21578/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
21579/// concatenate the result back.
21580static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
21581 MVT VT = Op.getSimpleValueType();
21582
21583  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
21584         "Unsupported value type for operation");
21585
21586 unsigned NumElems = VT.getVectorNumElements();
21587 SDLoc dl(Op);
21588 SDValue CC = Op.getOperand(2);
21589
21590 // Extract the LHS vectors
21591 SDValue LHS = Op.getOperand(0);
21592 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21593 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21594
21595 // Extract the RHS vectors
21596 SDValue RHS = Op.getOperand(1);
21597 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21598 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21599
21600 // Issue the operation on the smaller types and concatenate the result back
21601 MVT EltVT = VT.getVectorElementType();
21602 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21603 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21604 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
21605 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
21606}
21607
21608static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
21609
21610 SDValue Op0 = Op.getOperand(0);
21611 SDValue Op1 = Op.getOperand(1);
21612 SDValue CC = Op.getOperand(2);
21613 MVT VT = Op.getSimpleValueType();
21614 SDLoc dl(Op);
21615
21616  assert(VT.getVectorElementType() == MVT::i1 &&
21617         "Cannot set masked compare for this operation");
21618
21619 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
21620
21621 // Prefer SETGT over SETLT.
21622 if (SetCCOpcode == ISD::SETLT) {
21623 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
21624 std::swap(Op0, Op1);
21625 }
21626
21627 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
21628}
21629
21630/// Given a buildvector constant, return a new vector constant with each element
21631/// incremented or decremented. If incrementing or decrementing would result in
21632/// unsigned overflow or underflow or this is not a simple vector constant,
21633/// return an empty value.
21634static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
21635 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
21636 if (!BV)
21637 return SDValue();
21638
21639 MVT VT = V.getSimpleValueType();
21640 MVT EltVT = VT.getVectorElementType();
21641 unsigned NumElts = VT.getVectorNumElements();
21642 SmallVector<SDValue, 8> NewVecC;
21643 SDLoc DL(V);
21644 for (unsigned i = 0; i < NumElts; ++i) {
21645 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
21646 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
21647 return SDValue();
21648
21649 // Avoid overflow/underflow.
21650 const APInt &EltC = Elt->getAPIntValue();
21651 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isNullValue()))
21652 return SDValue();
21653
21654 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
21655 }
21656
21657 return DAG.getBuildVector(VT, DL, NewVecC);
21658}
21659
21660/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
21661/// Op0 u<= Op1:
21662/// t = psubus Op0, Op1
21663/// pcmpeq t, <0..0>
21664static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
21665 ISD::CondCode Cond, const SDLoc &dl,
21666 const X86Subtarget &Subtarget,
21667 SelectionDAG &DAG) {
21668 if (!Subtarget.hasSSE2())
21669 return SDValue();
21670
21671 MVT VET = VT.getVectorElementType();
21672 if (VET != MVT::i8 && VET != MVT::i16)
21673 return SDValue();
21674
21675 switch (Cond) {
21676 default:
21677 return SDValue();
21678 case ISD::SETULT: {
21679 // If the comparison is against a constant we can turn this into a
21680 // setule. With psubus, setule does not require a swap. This is
21681 // beneficial because the constant in the register is no longer
21682      // clobbered as the destination, so it can be hoisted out of a loop.
21683 // Only do this pre-AVX since vpcmp* is no longer destructive.
21684 if (Subtarget.hasAVX())
21685 return SDValue();
21686 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
21687 if (!ULEOp1)
21688 return SDValue();
21689 Op1 = ULEOp1;
21690 break;
21691 }
21692 case ISD::SETUGT: {
21693 // If the comparison is against a constant, we can turn this into a setuge.
21694 // This is beneficial because materializing a constant 0 for the PCMPEQ is
21695 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
21696 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
21697 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
21698 if (!UGEOp1)
21699 return SDValue();
21700 Op1 = Op0;
21701 Op0 = UGEOp1;
21702 break;
21703 }
21704 // Psubus is better than flip-sign because it requires no inversion.
21705 case ISD::SETUGE:
21706 std::swap(Op0, Op1);
21707 break;
21708 case ISD::SETULE:
21709 break;
21710 }
21711
21712 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
21713 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
21714 DAG.getConstant(0, dl, VT));
21715}
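
An exhaustive scalar check of the PSUBUS idea above (illustrative only; usubsat models one byte lane of PSUBUSB): Op0 u<= Op1 exactly when the saturating subtraction Op0 - Op1 is zero, which is what the PCMPEQ against zero then tests.

    #include <cassert>
    #include <cstdint>

    static uint8_t usubsat(uint8_t a, uint8_t b) {
      return a > b ? static_cast<uint8_t>(a - b) : 0; // one lane of PSUBUSB
    }

    int main() {
      for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b)
          assert((a <= b) == (usubsat(static_cast<uint8_t>(a),
                                      static_cast<uint8_t>(b)) == 0));
      return 0;
    }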
21716
21717static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
21718 SelectionDAG &DAG) {
21719 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
21720 Op.getOpcode() == ISD::STRICT_FSETCCS;
21721 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
21722 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
21723 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
21724 MVT VT = Op->getSimpleValueType(0);
21725 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
21726 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
21727 SDLoc dl(Op);
21728
21729 if (isFP) {
21730#ifndef NDEBUG
21731 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
21732    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
21733#endif
21734
21735 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
21736 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21737
21738 // If we have a strict compare with a vXi1 result and the input is 128/256
21739 // bits we can't use a masked compare unless we have VLX. If we use a wider
21740 // compare like we do for non-strict, we might trigger spurious exceptions
21741    // from the upper elements. Instead emit an AVX compare and convert to mask.
21742 unsigned Opc;
21743 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
21744 (!IsStrict || Subtarget.hasVLX() ||
21745 Op0.getSimpleValueType().is512BitVector())) {
21746      assert(VT.getVectorNumElements() <= 16);
21747 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
21748 } else {
21749 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
21750 // The SSE/AVX packed FP comparison nodes are defined with a
21751 // floating-point vector result that matches the operand type. This allows
21752 // them to work with an SSE1 target (integer vector types are not legal).
21753 VT = Op0.getSimpleValueType();
21754 }
21755
21756 SDValue Cmp;
21757 bool IsAlwaysSignaling;
21758 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
21759 if (!Subtarget.hasAVX()) {
21760 // TODO: We could use following steps to handle a quiet compare with
21761 // signaling encodings.
21762 // 1. Get ordered masks from a quiet ISD::SETO
21763 // 2. Use the masks to mask potential unordered elements in operand A, B
21764 // 3. Get the compare results of masked A, B
21765      // 4. Calculate the final result using the mask and the results from 3.
21766 // But currently, we just fall back to scalar operations.
21767 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
21768 return SDValue();
21769
21770 // Insert an extra signaling instruction to raise exception.
21771 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
21772 SDValue SignalCmp = DAG.getNode(
21773 Opc, dl, {VT, MVT::Other},
21774 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
21775 // FIXME: It seems we need to update the flags of all new strict nodes.
21776 // Otherwise, mayRaiseFPException in MI will return false due to
21777 // NoFPExcept = false by default. However, I didn't find it in other
21778 // patches.
21779 SignalCmp->setFlags(Op->getFlags());
21780 Chain = SignalCmp.getValue(1);
21781 }
21782
21783 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
21784 // emit two comparisons and a logic op to tie them together.
21785 if (SSECC >= 8) {
21786 // LLVM predicate is SETUEQ or SETONE.
21787 unsigned CC0, CC1;
21788 unsigned CombineOpc;
21789 if (Cond == ISD::SETUEQ) {
21790 CC0 = 3; // UNORD
21791 CC1 = 0; // EQ
21792 CombineOpc = X86ISD::FOR;
21793 } else {
21794          assert(Cond == ISD::SETONE);
21795 CC0 = 7; // ORD
21796 CC1 = 4; // NEQ
21797 CombineOpc = X86ISD::FAND;
21798 }
21799
21800 SDValue Cmp0, Cmp1;
21801 if (IsStrict) {
21802 Cmp0 = DAG.getNode(
21803 Opc, dl, {VT, MVT::Other},
21804 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
21805 Cmp1 = DAG.getNode(
21806 Opc, dl, {VT, MVT::Other},
21807 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
21808 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
21809 Cmp1.getValue(1));
21810 } else {
21811 Cmp0 = DAG.getNode(
21812 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
21813 Cmp1 = DAG.getNode(
21814 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
21815 }
21816 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
21817 } else {
21818 if (IsStrict) {
21819 Cmp = DAG.getNode(
21820 Opc, dl, {VT, MVT::Other},
21821 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
21822 Chain = Cmp.getValue(1);
21823 } else
21824 Cmp = DAG.getNode(
21825 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
21826 }
21827 } else {
21828 // Handle all other FP comparisons here.
21829 if (IsStrict) {
21830 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
21831 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
21832 Cmp = DAG.getNode(
21833 Opc, dl, {VT, MVT::Other},
21834 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
21835 Chain = Cmp.getValue(1);
21836 } else
21837 Cmp = DAG.getNode(
21838 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
21839 }
21840
21841 if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
21842 // We emitted a compare with an XMM/YMM result. Finish converting to a
21843 // mask register using a vptestm.
21844 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
21845 Cmp = DAG.getBitcast(CastVT, Cmp);
21846 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
21847 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
21848 } else {
21849 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
21850 // the result type of SETCC. The bitcast is expected to be optimized
21851 // away during combining/isel.
21852 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
21853 }
21854
21855 if (IsStrict)
21856 return DAG.getMergeValues({Cmp, Chain}, dl);
21857
21858 return Cmp;
21859 }
21860
21861  assert(!IsStrict && "Strict SETCC only handles FP operands.");
21862
21863 MVT VTOp0 = Op0.getSimpleValueType();
21864 (void)VTOp0;
21865  assert(VTOp0 == Op1.getSimpleValueType() &&
21866         "Expected operands with same type!");
21867  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
21868         "Invalid number of packed elements for source and destination!");
21869
21870 // The non-AVX512 code below works under the assumption that source and
21871 // destination types are the same.
21872  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
21873         "Value types for source and destination must be the same!");
21874
21875 // The result is boolean, but operands are int/float
21876 if (VT.getVectorElementType() == MVT::i1) {
21877 // In AVX-512 architecture setcc returns mask with i1 elements,
21878 // But there is no compare instruction for i8 and i16 elements in KNL.
21879    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
21880           "Unexpected operand type");
21881 return LowerIntVSETCC_AVX512(Op, DAG);
21882 }
21883
21884 // Lower using XOP integer comparisons.
21885 if (VT.is128BitVector() && Subtarget.hasXOP()) {
21886 // Translate compare code to XOP PCOM compare mode.
21887 unsigned CmpMode = 0;
21888 switch (Cond) {
21889 default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21889)
;
21890 case ISD::SETULT:
21891 case ISD::SETLT: CmpMode = 0x00; break;
21892 case ISD::SETULE:
21893 case ISD::SETLE: CmpMode = 0x01; break;
21894 case ISD::SETUGT:
21895 case ISD::SETGT: CmpMode = 0x02; break;
21896 case ISD::SETUGE:
21897 case ISD::SETGE: CmpMode = 0x03; break;
21898 case ISD::SETEQ: CmpMode = 0x04; break;
21899 case ISD::SETNE: CmpMode = 0x05; break;
21900 }
21901
21902 // Are we comparing unsigned or signed integers?
21903 unsigned Opc =
21904 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
21905
21906 return DAG.getNode(Opc, dl, VT, Op0, Op1,
21907 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
21908 }
21909
21910 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
21911 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
21912 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
21913 SDValue BC0 = peekThroughBitcasts(Op0);
21914 if (BC0.getOpcode() == ISD::AND) {
21915 APInt UndefElts;
21916 SmallVector<APInt, 64> EltBits;
21917 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
21918 VT.getScalarSizeInBits(), UndefElts,
21919 EltBits, false, false)) {
21920 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
21921 Cond = ISD::SETEQ;
21922 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
21923 }
21924 }
21925 }
21926 }
21927
21928 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
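  // For example, with v4i32 and C = 8 (bit 3): ShiftAmt = 32 - 3 - 1 = 28, so
  // SHL by 28 moves bit 3 into the sign bit and SRA by 31 broadcasts it,
  // giving an all-ones lane exactly where (X & 8) == 8.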
21929 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
21930 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
21931 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
21932 if (C1 && C1->getAPIntValue().isPowerOf2()) {
21933 unsigned BitWidth = VT.getScalarSizeInBits();
21934 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
21935
21936 SDValue Result = Op0.getOperand(0);
21937 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
21938 DAG.getConstant(ShiftAmt, dl, VT));
21939 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
21940 DAG.getConstant(BitWidth - 1, dl, VT));
21941 return Result;
21942 }
21943 }
21944
21945 // Break 256-bit integer vector compare into smaller ones.
21946 if (VT.is256BitVector() && !Subtarget.hasInt256())
21947 return Lower256IntVSETCC(Op, DAG);
21948
21949 // If this is a SETNE against the signed minimum value, change it to SETGT.
21950  // If this is a SETNE against the signed maximum value, change it to SETLT,
21951  // which will be swapped to SETGT.
21952 // Otherwise we use PCMPEQ+invert.
21953 APInt ConstValue;
21954 if (Cond == ISD::SETNE &&
21955 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
21956 if (ConstValue.isMinSignedValue())
21957 Cond = ISD::SETGT;
21958 else if (ConstValue.isMaxSignedValue())
21959 Cond = ISD::SETLT;
21960 }
21961
21962 // If both operands are known non-negative, then an unsigned compare is the
21963 // same as a signed compare and there's no need to flip signbits.
21964 // TODO: We could check for more general simplifications here since we're
21965 // computing known bits.
21966 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
21967 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
21968
21969 // Special case: Use min/max operations for unsigned compares.
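  // For example, with a legal UMIN/UMAX: X u<= Y becomes pcmpeq(X, umin(X, Y))
  // and X u>= Y becomes pcmpeq(X, umax(X, Y)); the strict forms reuse the same
  // pattern and then invert the result (see the Invert flag below).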
21970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21971 if (ISD::isUnsignedIntSetCC(Cond) &&
21972 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
21973 TLI.isOperationLegal(ISD::UMIN, VT)) {
21974 // If we have a constant operand, increment/decrement it and change the
21975 // condition to avoid an invert.
21976 if (Cond == ISD::SETUGT) {
21977 // X > C --> X >= (C+1) --> X == umax(X, C+1)
21978 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
21979 Op1 = UGTOp1;
21980 Cond = ISD::SETUGE;
21981 }
21982 }
21983 if (Cond == ISD::SETULT) {
21984 // X < C --> X <= (C-1) --> X == umin(X, C-1)
21985 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
21986 Op1 = ULTOp1;
21987 Cond = ISD::SETULE;
21988 }
21989 }
21990 bool Invert = false;
21991 unsigned Opc;
21992 switch (Cond) {
21993 default: llvm_unreachable("Unexpected condition code")::llvm::llvm_unreachable_internal("Unexpected condition code"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21993)
;
21994    case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
21995 case ISD::SETULE: Opc = ISD::UMIN; break;
21996    case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
21997 case ISD::SETUGE: Opc = ISD::UMAX; break;
21998 }
21999
22000 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
22001 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
22002
22003 // If the logical-not of the result is required, perform that now.
22004 if (Invert)
22005 Result = DAG.getNOT(dl, Result, VT);
22006
22007 return Result;
22008 }
22009
22010 // Try to use SUBUS and PCMPEQ.
22011 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
22012 return V;
22013
22014 // We are handling one of the integer comparisons here. Since SSE only has
22015  // GT and EQ comparisons for integers, swapping operands and multiple
22016 // operations may be required for some comparisons.
22017 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
22018 : X86ISD::PCMPGT;
22019 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
22020 Cond == ISD::SETGE || Cond == ISD::SETUGE;
22021 bool Invert = Cond == ISD::SETNE ||
22022 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
22023
22024 if (Swap)
22025 std::swap(Op0, Op1);
22026
22027 // Check that the operation in question is available (most are plain SSE2,
22028 // but PCMPGTQ and PCMPEQQ have different requirements).
22029 if (VT == MVT::v2i64) {
22030 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
22031      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
22032
22033 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
22034 // the odd elements over the even elements.
22035 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
22036 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
22037 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22038
22039 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22040 static const int MaskHi[] = { 1, 1, 3, 3 };
22041 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22042
22043 return DAG.getBitcast(VT, Result);
22044 }
22045
22046 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
22047 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22048 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
22049
22050 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22051 static const int MaskHi[] = { 1, 1, 3, 3 };
22052 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22053
22054 return DAG.getBitcast(VT, Result);
22055 }
22056
22057 // Since SSE has no unsigned integer comparisons, we need to flip the sign
22058 // bits of the inputs before performing those operations. The lower
22059 // compare is always unsigned.
22060 SDValue SB;
22061 if (FlipSigns) {
22062 SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64);
22063 } else {
22064 SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64);
22065 }
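      // Rough intuition for the constants above: XORing a 32-bit lane with
      // 0x80000000 makes signed PCMPGTD behave as an unsigned comparison of
      // the original values. The low halves always need that adjustment; the
      // high halves only need it when the original 64-bit compare was unsigned.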
22066 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
22067 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
22068
22069 // Cast everything to the right type.
22070 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22071 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22072
22073 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
22074 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
22075 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
22076
22077 // Create masks for only the low parts/high parts of the 64 bit integers.
22078 static const int MaskHi[] = { 1, 1, 3, 3 };
22079 static const int MaskLo[] = { 0, 0, 2, 2 };
22080 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
22081 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
22082 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
22083
22084 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
22085 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
22086
22087 if (Invert)
22088 Result = DAG.getNOT(dl, Result, MVT::v4i32);
22089
22090 return DAG.getBitcast(VT, Result);
22091 }
22092
22093 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
22094 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
22095 // pcmpeqd + pshufd + pand.
22096      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
22097
22098 // First cast everything to the right type.
22099 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
22100 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
22101
22102 // Do the compare.
22103 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
22104
22105 // Make sure the lower and upper halves are both all-ones.
22106 static const int Mask[] = { 1, 0, 3, 2 };
22107 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
22108 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
22109
22110 if (Invert)
22111 Result = DAG.getNOT(dl, Result, MVT::v4i32);
22112
22113 return DAG.getBitcast(VT, Result);
22114 }
22115 }
22116
22117 // Since SSE has no unsigned integer comparisons, we need to flip the sign
22118 // bits of the inputs before performing those operations.
22119 if (FlipSigns) {
22120 MVT EltVT = VT.getVectorElementType();
22121 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
22122 VT);
22123 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
22124 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
22125 }
22126
22127 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
22128
22129 // If the logical-not of the result is required, perform that now.
22130 if (Invert)
22131 Result = DAG.getNOT(dl, Result, VT);
22132
22133 return Result;
22134}
22135
22136// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
22137static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
22138 const SDLoc &dl, SelectionDAG &DAG,
22139 const X86Subtarget &Subtarget,
22140 SDValue &X86CC) {
22141 // Only support equality comparisons.
22142 if (CC != ISD::SETEQ && CC != ISD::SETNE)
22143 return SDValue();
22144
22145 // Must be a bitcast from vXi1.
22146 if (Op0.getOpcode() != ISD::BITCAST)
22147 return SDValue();
22148
22149 Op0 = Op0.getOperand(0);
22150 MVT VT = Op0.getSimpleValueType();
22151 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
22152 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
22153 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
22154 return SDValue();
22155
22156 X86::CondCode X86Cond;
22157 if (isNullConstant(Op1)) {
22158 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22159 } else if (isAllOnesConstant(Op1)) {
22160 // C flag is set for all ones.
22161 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
22162 } else
22163 return SDValue();
22164
22165  // If the input is an AND, we can combine its operands into the KTEST.
22166 bool KTestable = false;
22167 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
22168 KTestable = true;
22169 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
22170 KTestable = true;
22171 if (!isNullConstant(Op1))
22172 KTestable = false;
22173 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
22174 SDValue LHS = Op0.getOperand(0);
22175 SDValue RHS = Op0.getOperand(1);
22176 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
22177 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
22178 }
22179
22180  // If the input is an OR, we can combine its operands into the KORTEST.
22181 SDValue LHS = Op0;
22182 SDValue RHS = Op0;
22183 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
22184 LHS = Op0.getOperand(0);
22185 RHS = Op0.getOperand(1);
22186 }
22187
22188 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
22189 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
22190}
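// For example, (i16 (bitcast v16i1 X)) == 0 can become KORTESTW X, X checked
// with COND_E; comparing against all-ones tests the carry flag instead, and
// an OR feeding the bitcast folds its operands straight into the KORTEST.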
22191
22192/// Emit flags for the given setcc condition and operands. Also returns the
22193/// corresponding X86 condition code constant in X86CC.
22194SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
22195 ISD::CondCode CC, const SDLoc &dl,
22196 SelectionDAG &DAG,
22197 SDValue &X86CC) const {
22198 // Optimize to BT if possible.
22199 // Lower (X & (1 << N)) == 0 to BT(X, N).
22200 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
22201 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
22202 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
22203 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22204 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC))
22205 return BT;
22206 }
22207
22208  // Try to use PTEST for a tree of ORs equality-compared with 0.
22209 // TODO: We could do AND tree with all 1s as well by using the C flag.
22210 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
22211 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22212 if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
22213 return PTEST;
22214 }
22215
22216 // Try to lower using KORTEST or KTEST.
22217 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
22218 return Test;
22219
22220 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
22221 // these.
22222 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
22223 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22224 // If the input is a setcc, then reuse the input setcc or use a new one with
22225 // the inverted condition.
22226 if (Op0.getOpcode() == X86ISD::SETCC) {
22227 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
22228
22229 X86CC = Op0.getOperand(0);
22230 if (Invert) {
22231 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
22232 CCode = X86::GetOppositeBranchCondition(CCode);
22233 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
22234 }
22235
22236 return Op0.getOperand(1);
22237 }
22238 }
22239
22240  // Try to use the carry flag from the add in place of a separate CMP for:
22241 // (seteq (add X, -1), -1). Similar for setne.
22242 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
22243 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
22244 if (isProfitableToUseFlagOp(Op0)) {
22245 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
22246
22247 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
22248 Op0.getOperand(1));
22249 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
22250 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22251 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
22252 return SDValue(New.getNode(), 1);
22253 }
22254 }
22255
22256 X86::CondCode CondCode =
22257 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
22258  assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
22259
22260 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
22261 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
22262 return EFLAGS;
22263}
22264
22265SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
22266
22267 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
22268 Op.getOpcode() == ISD::STRICT_FSETCCS;
22269 MVT VT = Op->getSimpleValueType(0);
22270
22271 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
22272
22273  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
22274 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22275 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
22276 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
22277 SDLoc dl(Op);
22278 ISD::CondCode CC =
22279 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
22280
22281 // Handle f128 first, since one possible outcome is a normal integer
22282 // comparison which gets handled by emitFlagsForSetcc.
22283 if (Op0.getValueType() == MVT::f128) {
22284 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
22285 Op.getOpcode() == ISD::STRICT_FSETCCS);
22286
22287 // If softenSetCCOperands returned a scalar, use it.
22288 if (!Op1.getNode()) {
22289      assert(Op0.getValueType() == Op.getValueType() &&
22290             "Unexpected setcc expansion!");
22291 if (IsStrict)
22292 return DAG.getMergeValues({Op0, Chain}, dl);
22293 return Op0;
22294 }
22295 }
22296
22297 if (Op0.getSimpleValueType().isInteger()) {
22298 SDValue X86CC;
22299 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
22300 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
22301 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
22302 }
22303
22304 // Handle floating point.
22305 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
22306 if (CondCode == X86::COND_INVALID)
22307 return SDValue();
22308
22309 SDValue EFLAGS;
22310 if (IsStrict) {
22311 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
22312 EFLAGS =
22313 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
22314 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
22315 Chain = EFLAGS.getValue(1);
22316 } else {
22317 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
22318 }
22319
22320 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
22321 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
22322 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
22323}
22324
22325SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
22326 SDValue LHS = Op.getOperand(0);
22327 SDValue RHS = Op.getOperand(1);
22328 SDValue Carry = Op.getOperand(2);
22329 SDValue Cond = Op.getOperand(3);
22330 SDLoc DL(Op);
22331
22332  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
22333 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
22334
22335 // Recreate the carry if needed.
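  // (Adding all-ones to the materialized carry value produces a hardware
  // carry-out exactly when that value is nonzero, which puts the carry back
  // into EFLAGS.CF for the SBB below.)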
22336 EVT CarryVT = Carry.getValueType();
22337 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
22338 Carry, DAG.getAllOnesConstant(DL, CarryVT));
22339
22340 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
22341 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
22342 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
22343}
22344
22345// This function returns three things: the arithmetic computation itself
22346// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
22347// flag and the condition code define the case in which the arithmetic
22348// computation overflows.
22349static std::pair<SDValue, SDValue>
22350getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
22351  assert(Op.getResNo() == 0 && "Unexpected result number!");
22352 SDValue Value, Overflow;
22353 SDValue LHS = Op.getOperand(0);
22354 SDValue RHS = Op.getOperand(1);
22355 unsigned BaseOp = 0;
22356 SDLoc DL(Op);
22357 switch (Op.getOpcode()) {
22358 default: llvm_unreachable("Unknown ovf instruction!")::llvm::llvm_unreachable_internal("Unknown ovf instruction!",
"/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22358)
;
22359 case ISD::SADDO:
22360 BaseOp = X86ISD::ADD;
22361 Cond = X86::COND_O;
22362 break;
22363 case ISD::UADDO:
22364 BaseOp = X86ISD::ADD;
22365 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
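    // (Adding 1 carries out exactly when the result wraps to zero, so ZF via
    // COND_E is an equivalent overflow check; presumably this lets the add be
    // selected as INC, which does not update CF.)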
22366 break;
22367 case ISD::SSUBO:
22368 BaseOp = X86ISD::SUB;
22369 Cond = X86::COND_O;
22370 break;
22371 case ISD::USUBO:
22372 BaseOp = X86ISD::SUB;
22373 Cond = X86::COND_B;
22374 break;
22375 case ISD::SMULO:
22376 BaseOp = X86ISD::SMUL;
22377 Cond = X86::COND_O;
22378 break;
22379 case ISD::UMULO:
22380 BaseOp = X86ISD::UMUL;
22381 Cond = X86::COND_O;
22382 break;
22383 }
22384
22385 if (BaseOp) {
22386 // Also sets EFLAGS.
22387 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22388 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22389 Overflow = Value.getValue(1);
22390 }
22391
22392 return std::make_pair(Value, Overflow);
22393}
22394
22395static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22396 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22397 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22398 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22399 // has only one use.
22400 SDLoc DL(Op);
22401 X86::CondCode Cond;
22402 SDValue Value, Overflow;
22403 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
22404
22405 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
22406 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!")((Op->getValueType(1) == MVT::i8 && "Unexpected VT!"
) ? static_cast<void> (0) : __assert_fail ("Op->getValueType(1) == MVT::i8 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22406, __PRETTY_FUNCTION__))
;
22407 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
22408}
22409
22410/// Return true if opcode is a X86 logical comparison.
22411static bool isX86LogicalCmp(SDValue Op) {
22412 unsigned Opc = Op.getOpcode();
22413 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
22414 Opc == X86ISD::FCMP)
22415 return true;
22416 if (Op.getResNo() == 1 &&
22417 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
22418 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
22419 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
22420 return true;
22421
22422 return false;
22423}
22424
22425static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
22426 if (V.getOpcode() != ISD::TRUNCATE)
22427 return false;
22428
22429 SDValue VOp0 = V.getOperand(0);
22430 unsigned InBits = VOp0.getValueSizeInBits();
22431 unsigned Bits = V.getValueSizeInBits();
22432 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
22433}
22434
22435SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
22436 bool AddTest = true;
22437 SDValue Cond = Op.getOperand(0);
22438 SDValue Op1 = Op.getOperand(1);
22439 SDValue Op2 = Op.getOperand(2);
22440 SDLoc DL(Op);
22441 MVT VT = Op1.getSimpleValueType();
22442 SDValue CC;
22443
22444 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
22445 // are available or VBLENDV if AVX is available.
22446 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
22447 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
22448 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
22449 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
22450 bool IsAlwaysSignaling;
22451 unsigned SSECC =
22452 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
22453 CondOp0, CondOp1, IsAlwaysSignaling);
22454
22455 if (Subtarget.hasAVX512()) {
22456 SDValue Cmp =
22457 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
22458 DAG.getTargetConstant(SSECC, DL, MVT::i8));
22459      assert(!VT.isVector() && "Not a scalar type?");
22460 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
22461 }
22462
22463 if (SSECC < 8 || Subtarget.hasAVX()) {
22464 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
22465 DAG.getTargetConstant(SSECC, DL, MVT::i8));
22466
22467 // If we have AVX, we can use a variable vector select (VBLENDV) instead
22468 // of 3 logic instructions for size savings and potentially speed.
22469 // Unfortunately, there is no scalar form of VBLENDV.
22470
22471 // If either operand is a +0.0 constant, don't try this. We can expect to
22472 // optimize away at least one of the logic instructions later in that
22473 // case, so that sequence would be faster than a variable blend.
22474
22475 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
22476 // uses XMM0 as the selection register. That may need just as many
22477 // instructions as the AND/ANDN/OR sequence due to register moves, so
22478 // don't bother.
22479 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
22480 !isNullFPConstant(Op2)) {
22481 // Convert to vectors, do a VSELECT, and convert back to scalar.
22482 // All of the conversions should be optimized away.
22483 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
22484 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
22485 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
22486 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
22487
22488 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
22489 VCmp = DAG.getBitcast(VCmpVT, VCmp);
22490
22491 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
22492
22493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
22494 VSel, DAG.getIntPtrConstant(0, DL));
22495 }
22496 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
22497 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
22498 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
22499 }
22500 }
22501
22502 // AVX512 fallback is to lower selects of scalar floats to masked moves.
22503 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
22504 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
22505 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
22506 }
22507
22508 if (Cond.getOpcode() == ISD::SETCC) {
22509 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
22510 Cond = NewCond;
22511 // If the condition was updated, it's possible that the operands of the
22512 // select were also updated (for example, EmitTest has a RAUW). Refresh
22513 // the local references to the select operands in case they got stale.
22514 Op1 = Op.getOperand(1);
22515 Op2 = Op.getOperand(2);
22516 }
22517 }
22518
22519 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
22520 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
22521 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
22522 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
22523 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
22524 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
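  // Rough intuition for the sign_bit patterns above: x - 1 borrows exactly
  // when x == 0, so SBB(0, 0, borrow) yields -1 precisely in the x == 0 case
  // and 0 otherwise; ORing that mask with y then produces the selected value
  // without a branch or cmov.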
22525 if (Cond.getOpcode() == X86ISD::SETCC &&
22526 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
22527 isNullConstant(Cond.getOperand(1).getOperand(1))) {
22528 SDValue Cmp = Cond.getOperand(1);
22529 unsigned CondCode = Cond.getConstantOperandVal(0);
22530
22531 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
22532 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
22533 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
22534 SDValue CmpOp0 = Cmp.getOperand(0);
22535
22536 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22537 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
22538
22539 // Apply further optimizations for special cases
22540 // (select (x != 0), -1, 0) -> neg & sbb
22541 // (select (x == 0), 0, -1) -> neg & sbb
22542 if (isNullConstant(Y) &&
22543 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
22544 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
22545 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
22546 Zero = DAG.getConstant(0, DL, Op.getValueType());
22547 return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
22548 }
22549
22550 Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
22551 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
22552
22553 SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
22554 SDValue Res = // Res = 0 or -1.
22555 DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
22556
22557 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
22558 Res = DAG.getNOT(DL, Res, Res.getValueType());
22559
22560 return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
22561 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
22562 Cmp.getOperand(0).getOpcode() == ISD::AND &&
22563 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
22564 SDValue CmpOp0 = Cmp.getOperand(0);
22565 SDValue Src1, Src2;
22566      // True if Op2 is an XOR or OR operator and one of its operands
22567      // is equal to Op1:
22568      //   (a, a op b) or (b, a op b)
22569 auto isOrXorPattern = [&]() {
22570 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
22571 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
22572 Src1 =
22573 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
22574 Src2 = Op1;
22575 return true;
22576 }
22577 return false;
22578 };
22579
22580 if (isOrXorPattern()) {
22581 SDValue Neg;
22582 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
22583        // We need a mask of all zeros or all ones with the same size as the
22584        // other operands.
22585 if (CmpSz > VT.getSizeInBits())
22586 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
22587 else if (CmpSz < VT.getSizeInBits())
22588 Neg = DAG.getNode(ISD::AND, DL, VT,
22589 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
22590 DAG.getConstant(1, DL, VT));
22591 else
22592 Neg = CmpOp0;
22593 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
22594 Neg); // -(and (x, 0x1))
22595 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
22596 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
22597 }
22598 }
22599 }
22600
22601 // Look past (and (setcc_carry (cmp ...)), 1).
22602 if (Cond.getOpcode() == ISD::AND &&
22603 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
22604 isOneConstant(Cond.getOperand(1)))
22605 Cond = Cond.getOperand(0);
22606
22607 // If condition flag is set by a X86ISD::CMP, then use it as the condition
22608 // setting operand in place of the X86ISD::SETCC.
22609 unsigned CondOpcode = Cond.getOpcode();
22610 if (CondOpcode == X86ISD::SETCC ||
22611 CondOpcode == X86ISD::SETCC_CARRY) {
22612 CC = Cond.getOperand(0);
22613
22614 SDValue Cmp = Cond.getOperand(1);
22615 bool IllegalFPCMov = false;
22616 if (VT.isFloatingPoint() && !VT.isVector() &&
22617 !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
22618 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
22619
22620 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
22621 Cmp.getOpcode() == X86ISD::BT) { // FIXME
22622 Cond = Cmp;
22623 AddTest = false;
22624 }
22625 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
22626 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
22627 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
22628 SDValue Value;
22629 X86::CondCode X86Cond;
22630 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
22631
22632 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
22633 AddTest = false;
22634 }
22635
22636 if (AddTest) {
22637 // Look past the truncate if the high bits are known zero.
22638 if (isTruncWithZeroHighBitsInput(Cond, DAG))
22639 Cond = Cond.getOperand(0);
22640
22641 // We know the result of AND is compared against zero. Try to match
22642 // it to BT.
22643 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
22644 SDValue BTCC;
22645 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) {
22646 CC = BTCC;
22647 Cond = BT;
22648 AddTest = false;
22649 }
22650 }
22651 }
22652
22653 if (AddTest) {
22654 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
22655 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
22656 }
22657
22658 // a < b ? -1 : 0 -> RES = ~setcc_carry
22659 // a < b ? 0 : -1 -> RES = setcc_carry
22660 // a >= b ? -1 : 0 -> RES = setcc_carry
22661 // a >= b ? 0 : -1 -> RES = ~setcc_carry
22662 if (Cond.getOpcode() == X86ISD::SUB) {
22663 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
22664
22665 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
22666 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
22667 (isNullConstant(Op1) || isNullConstant(Op2))) {
22668 SDValue Res =
22669 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
22670 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
22671 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
22672 return DAG.getNOT(DL, Res, Res.getValueType());
22673 return Res;
22674 }
22675 }
22676
22677 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
22678 // widen the cmov and push the truncate through. This avoids introducing a new
22679 // branch during isel and doesn't add any extensions.
22680 if (Op.getValueType() == MVT::i8 &&
22681 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
22682 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
22683 if (T1.getValueType() == T2.getValueType() &&
22684 // Blacklist CopyFromReg to avoid partial register stalls.
22685 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
22686 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
22687 CC, Cond);
22688 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
22689 }
22690 }
22691
22692 // Or finally, promote i8 cmovs if we have CMOV,
22693 // or i16 cmovs if it won't prevent folding a load.
22694 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
22695 // legal, but EmitLoweredSelect() can not deal with these extensions
22696 // being inserted between two CMOV's. (in i16 case too TBN)
22697 // https://bugs.llvm.org/show_bug.cgi?id=40974
22698 if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
22699 (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
22700 !MayFoldLoad(Op2))) {
22701 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
22702 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
22703 SDValue Ops[] = { Op2, Op1, CC, Cond };
22704 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
22705 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
22706 }
22707
22708 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
22709 // condition is true.
22710 SDValue Ops[] = { Op2, Op1, CC, Cond };
22711 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
22712}
22713
22714static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
22715 const X86Subtarget &Subtarget,
22716 SelectionDAG &DAG) {
22717 MVT VT = Op->getSimpleValueType(0);
22718 SDValue In = Op->getOperand(0);
22719 MVT InVT = In.getSimpleValueType();
22720  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22721 MVT VTElt = VT.getVectorElementType();
22722 SDLoc dl(Op);
22723
22724 unsigned NumElts = VT.getVectorNumElements();
22725
22726 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
22727 MVT ExtVT = VT;
22728 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
22729 // If v16i32 is to be avoided, we'll need to split and concatenate.
22730 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22731 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
22732
22733 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22734 }
22735
22736 // Widen to 512-bits if VLX is not supported.
22737 MVT WideVT = ExtVT;
22738 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22739 NumElts *= 512 / ExtVT.getSizeInBits();
22740 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22741 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
22742 In, DAG.getIntPtrConstant(0, dl));
22743 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
22744 }
22745
22746 SDValue V;
22747 MVT WideEltVT = WideVT.getVectorElementType();
22748 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
22749 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
22750 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
22751 } else {
22752 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
22753 SDValue Zero = DAG.getConstant(0, dl, WideVT);
22754 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
22755 }
22756
22757 // Truncate if we had to extend i16/i8 above.
22758 if (VT != ExtVT) {
22759 WideVT = MVT::getVectorVT(VTElt, NumElts);
22760 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
22761 }
22762
22763 // Extract back to 128/256-bit if we widened.
22764 if (WideVT != VT)
22765 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
22766 DAG.getIntPtrConstant(0, dl));
22767
22768 return V;
22769}
22770
22771static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22772 SelectionDAG &DAG) {
22773 SDValue In = Op->getOperand(0);
22774 MVT InVT = In.getSimpleValueType();
22775
22776 if (InVT.getVectorElementType() == MVT::i1)
22777 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
22778
22779  assert(Subtarget.hasAVX() && "Expected AVX support");
22780 return LowerAVXExtend(Op, DAG, Subtarget);
22781}
22782
22783// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
22784// For sign extend this needs to handle all vector sizes and SSE4.1 and
22785// non-SSE4.1 targets. For zero extend this should only handle inputs of
22786// MVT::v64i8 when BWI is not supported, but AVX512 is.
22787static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
22788 const X86Subtarget &Subtarget,
22789 SelectionDAG &DAG) {
22790 SDValue In = Op->getOperand(0);
22791 MVT VT = Op->getSimpleValueType(0);
22792 MVT InVT = In.getSimpleValueType();
22793
22794 MVT SVT = VT.getVectorElementType();
22795 MVT InSVT = InVT.getVectorElementType();
22796  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
22797
22798 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
22799 return SDValue();
22800 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
22801 return SDValue();
22802 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
22803 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
22804 !(VT.is512BitVector() && Subtarget.hasAVX512()))
22805 return SDValue();
22806
22807 SDLoc dl(Op);
22808 unsigned Opc = Op.getOpcode();
22809 unsigned NumElts = VT.getVectorNumElements();
22810
22811 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
22812 // For 512-bit vectors, we need 128-bits or 256-bits.
22813 if (InVT.getSizeInBits() > 128) {
22814 // Input needs to be at least the same number of elements as output, and
22815 // at least 128-bits.
22816 int InSize = InSVT.getSizeInBits() * NumElts;
22817 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
22818 InVT = In.getSimpleValueType();
22819 }
22820
22821  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
22822  // so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions
22823  // still need to be handled here for 256/512-bit results.
22824 if (Subtarget.hasInt256()) {
22825 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension")((VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"
) ? static_cast<void> (0) : __assert_fail ("VT.getSizeInBits() > 128 && \"Unexpected 128-bit vector extension\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 22825, __PRETTY_FUNCTION__))
;
22826
22827 if (InVT.getVectorNumElements() != NumElts)
22828 return DAG.getNode(Op.getOpcode(), dl, VT, In);
22829
22830 // FIXME: Apparently we create inreg operations that could be regular
22831 // extends.
22832 unsigned ExtOpc =
22833 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
22834 : ISD::ZERO_EXTEND;
22835 return DAG.getNode(ExtOpc, dl, VT, In);
22836 }
22837
22838 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
22839 if (Subtarget.hasAVX()) {
22840    assert(VT.is256BitVector() && "256-bit vector expected");
22841 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22842 int HalfNumElts = HalfVT.getVectorNumElements();
22843
22844 unsigned NumSrcElts = InVT.getVectorNumElements();
22845 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
22846 for (int i = 0; i != HalfNumElts; ++i)
22847 HiMask[i] = HalfNumElts + i;
22848
22849 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
22850 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
22851 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
22852 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
22853 }
22854
22855 // We should only get here for sign extend.
22856  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
22857  assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
22858
22859 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
22860 SDValue Curr = In;
22861 SDValue SignExt = Curr;
22862
22863 // As SRAI is only available on i16/i32 types, we expand only up to i32
22864 // and handle i64 separately.
22865 if (InVT != MVT::v4i32) {
22866 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
22867
22868 unsigned DestWidth = DestVT.getScalarSizeInBits();
22869 unsigned Scale = DestWidth / InSVT.getSizeInBits();
22870
22871 unsigned InNumElts = InVT.getVectorNumElements();
22872 unsigned DestElts = DestVT.getVectorNumElements();
22873
22874 // Build a shuffle mask that takes each input element and places it in the
22875 // MSBs of the new element size.
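// (Editor's note: a worked example of the mask built below, assuming a
//  v16i8 -> v8i16 sign_extend_vector_inreg. Here DestWidth = 16, InSVT = i8,
//  so Scale = 2 and DestElts = 8; the loop sets
//    Mask = { -1, 0, -1, 1, -1, 2, ..., -1, 7 }
//  which moves input byte i into the high byte of 16-bit lane i. The VSRAI by
//  DestWidth - 8 = 8 bits then shifts each value back down, replicating the
//  sign bit into the upper half of each lane.)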
22876 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
22877 for (unsigned i = 0; i != DestElts; ++i)
22878 Mask[i * Scale + (Scale - 1)] = i;
22879
22880 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
22881 Curr = DAG.getBitcast(DestVT, Curr);
22882
22883 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
22884 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
22885 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
22886 }
22887
22888 if (VT == MVT::v2i64) {
22889 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
22890 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
22891 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
22892 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
22893 SignExt = DAG.getBitcast(VT, SignExt);
22894 }
22895
22896 return SignExt;
22897}
22898
22899static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22900 SelectionDAG &DAG) {
22901 MVT VT = Op->getSimpleValueType(0);
22902 SDValue In = Op->getOperand(0);
22903 MVT InVT = In.getSimpleValueType();
22904 SDLoc dl(Op);
22905
22906 if (InVT.getVectorElementType() == MVT::i1)
22907 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
22908
22909 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22910 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22911        "Expected same number of elements");
22912 assert((VT.getVectorElementType() == MVT::i16 ||
22913         VT.getVectorElementType() == MVT::i32 ||
22914         VT.getVectorElementType() == MVT::i64) &&
22915        "Unexpected element type");
22916 assert((InVT.getVectorElementType() == MVT::i8 ||
22917         InVT.getVectorElementType() == MVT::i16 ||
22918         InVT.getVectorElementType() == MVT::i32) &&
22919        "Unexpected element type");
22920
22921 // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
22922 if (InVT == MVT::v8i8) {
22923 if (VT != MVT::v8i64)
22924 return SDValue();
22925
22926 In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
22927 MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
22928 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
22929 }
22930
22931 if (Subtarget.hasInt256())
22932 return Op;
22933
22934 // Optimize vectors in AVX mode:
22935 // sign extend v8i16 to v8i32 and
22936 // v4i32 to v4i64.
22937 //
22938 // Divide the input vector into two parts;
22939 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
22940 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
22941 // then concat the vectors back to the original VT.
22942 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22943 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
22944
22945 unsigned NumElems = InVT.getVectorNumElements();
22946 SmallVector<int,8> ShufMask(NumElems, -1);
22947 for (unsigned i = 0; i != NumElems/2; ++i)
22948 ShufMask[i] = i + NumElems/2;
22949
22950 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
22951 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
22952
22953 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22954}
22955
22956/// Change a vector store into a pair of half-size vector stores.
22957static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
22958 SDValue StoredVal = Store->getValue();
22959 assert((StoredVal.getValueType().is256BitVector() ||
22960         StoredVal.getValueType().is512BitVector()) &&
22961        "Expecting 256/512-bit op");
22962
22963 // Splitting volatile memory ops is not allowed unless the operation was not
22964 // legal to begin with. Assume the input store is legal (this transform is
22965 // only used for targets with AVX). Note: It is possible that we have an
22966 // illegal type like v2i128, and so we could allow splitting a volatile store
22967 // in that case if that is important.
22968 if (!Store->isSimple())
22969 return SDValue();
22970
22971 EVT StoreVT = StoredVal.getValueType();
22972 unsigned NumElems = StoreVT.getVectorNumElements();
22973 unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
22974 unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
22975
22976 SDLoc DL(Store);
22977 SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
22978 SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
22979 SDValue Ptr0 = Store->getBasePtr();
22980 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
22981 unsigned Alignment = Store->getAlignment();
22982 SDValue Ch0 =
22983 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
22984 Alignment, Store->getMemOperand()->getFlags());
22985 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
22986 Store->getPointerInfo().getWithOffset(HalfAlign),
22987 MinAlign(Alignment, HalfAlign),
22988 Store->getMemOperand()->getFlags());
22989 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
22990}
22991
22992/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
22993/// type.
22994static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
22995 SelectionDAG &DAG) {
22996 SDValue StoredVal = Store->getValue();
22997 assert(StoreVT.is128BitVector() &&
22998        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
22999 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
23000
23001 // Splitting volatile memory ops is not allowed unless the operation was not
23002 // legal to begin with. We are assuming the input op is legal (this transform
23003 // is only used for targets with AVX).
23004 if (!Store->isSimple())
23005 return SDValue();
23006
23007 MVT StoreSVT = StoreVT.getScalarType();
23008 unsigned NumElems = StoreVT.getVectorNumElements();
23009 unsigned ScalarSize = StoreSVT.getStoreSize();
23010 unsigned Alignment = Store->getAlignment();
23011
23012 SDLoc DL(Store);
23013 SmallVector<SDValue, 4> Stores;
23014 for (unsigned i = 0; i != NumElems; ++i) {
23015 unsigned Offset = i * ScalarSize;
23016 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
23017 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
23018 DAG.getIntPtrConstant(i, DL));
23019 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
23020 Store->getPointerInfo().getWithOffset(Offset),
23021 MinAlign(Alignment, Offset),
23022 Store->getMemOperand()->getFlags());
23023 Stores.push_back(Ch);
23024 }
23025 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
23026}
23027
23028static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
23029 SelectionDAG &DAG) {
23030 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
23031 SDLoc dl(St);
23032 SDValue StoredVal = St->getValue();
23033
23034 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
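// (Editor's note: e.g. a v8i1 mask value is widened below by inserting it into
//  an undef v16i1, bitcast to i16, truncated to i8 and stored as a single
//  byte, since without AVX512DQ there is no KMOVB to memory.)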
23035 if (StoredVal.getValueType().isVector() &&
23036 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
23037 assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
23038        "Unexpected VT");
23039 assert(!St->isTruncatingStore() && "Expected non-truncating store");
23040 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
23041        "Expected AVX512F without AVX512DQI");
23042
23043 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
23044 DAG.getUNDEF(MVT::v16i1), StoredVal,
23045 DAG.getIntPtrConstant(0, dl));
23046 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
23047 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
23048
23049 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
23050 St->getPointerInfo(), St->getAlignment(),
23051 St->getMemOperand()->getFlags());
23052 }
23053
23054 if (St->isTruncatingStore())
23055 return SDValue();
23056
23057 // If this is a 256-bit store of concatenated ops, we are better off splitting
23058 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
23059 // and each half can execute independently. Some cores would split the op into
23060 // halves anyway, so the concat (vinsertf128) is purely an extra op.
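// (Editor's note: a small sketch of the transform, with hypothetical values.
//  For a store of
//    v8i32 StoredVal = concat_vectors(v4i32 A, v4i32 B)
//  splitVectorStore above emits two independent 128-bit stores,
//    store A -> Ptr
//    store B -> Ptr + 16
//  joined by a TokenFactor, instead of materializing the 256-bit concat
//  (vinsertf128) just to store it.)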
23061 MVT StoreVT = StoredVal.getSimpleValueType();
23062 if (StoreVT.is256BitVector()) {
23063 SmallVector<SDValue, 4> CatOps;
23064 if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
23065 return splitVectorStore(St, DAG);
23066 return SDValue();
23067 }
23068
23069 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23070 assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
23071        "Unexpected VT");
23072 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
23073        TargetLowering::TypeWidenVector && "Unexpected type action!");
23074
23075 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
23076 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
23077 DAG.getUNDEF(StoreVT));
23078
23079 if (Subtarget.hasSSE2()) {
23080 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
23081 // and store it.
23082 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
23083 MVT CastVT = MVT::getVectorVT(StVT, 2);
23084 StoredVal = DAG.getBitcast(CastVT, StoredVal);
23085 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
23086 DAG.getIntPtrConstant(0, dl));
23087
23088 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
23089 St->getPointerInfo(), St->getAlignment(),
23090 St->getMemOperand()->getFlags());
23091 }
23092 assert(Subtarget.hasSSE1() && "Expected SSE");
23093 SDVTList Tys = DAG.getVTList(MVT::Other);
23094 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
23095 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
23096 St->getMemOperand());
23097}
23098
23099// Lower vector extended loads using a shuffle. If SSSE3 is not available we
23100// may emit an illegal shuffle but the expansion is still better than scalar
23101// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
23102 // we'll emit a shuffle and an arithmetic shift.
23103// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
23104// TODO: It is possible to support ZExt by zeroing the undef values during
23105// the shuffle phase or after the shuffle.
23106static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
23107 SelectionDAG &DAG) {
23108 MVT RegVT = Op.getSimpleValueType();
23109 assert(RegVT.isVector() && "We only custom lower vector loads.");
23110 assert(RegVT.isInteger() &&
23111        "We only custom lower integer vector loads.");
23112
23113 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
23114 SDLoc dl(Ld);
23115
23116 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
23117 if (RegVT.getVectorElementType() == MVT::i1) {
23118 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
23119 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
23120 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
23121        "Expected AVX512F without AVX512DQI");
23122
23123 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
23124 Ld->getPointerInfo(), Ld->getAlignment(),
23125 Ld->getMemOperand()->getFlags());
23126
23127 // Replace chain users with the new chain.
23128 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
23129
23130 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
23131 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
23132 DAG.getBitcast(MVT::v16i1, Val),
23133 DAG.getIntPtrConstant(0, dl));
23134 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
23135 }
23136
23137 return SDValue();
23138}
23139
23140/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
23141/// each of which has no other use apart from the AND / OR.
23142static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
23143 Opc = Op.getOpcode();
23144 if (Opc != ISD::OR && Opc != ISD::AND)
23145 return false;
23146 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
23147 Op.getOperand(0).hasOneUse() &&
23148 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
23149 Op.getOperand(1).hasOneUse());
23150}
23151
23152SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
23153 SDValue Chain = Op.getOperand(0);
23154 SDValue Cond = Op.getOperand(1);
23155 SDValue Dest = Op.getOperand(2);
23156 SDLoc dl(Op);
23157
23158 if (Cond.getOpcode() == ISD::SETCC &&
23159 Cond.getOperand(0).getValueType() != MVT::f128) {
23160 SDValue LHS = Cond.getOperand(0);
23161 SDValue RHS = Cond.getOperand(1);
23162 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23163
23164 // Special case for
23165 // setcc([su]{add,sub,mul}o == 0)
23166 // setcc([su]{add,sub,mul}o != 1)
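// (Editor's note: a concrete reading of the inversion below. For
//    brcond (setcc (saddo X, Y).overflow, 0, eq), Dest
//  the branch is taken when the overflow flag is clear, so the X86 condition
//  returned by getX86XALUOOp (e.g. COND_O) is flipped to its opposite
//  (COND_NO) before emitting X86ISD::BRCOND. The "!= 1" form is handled the
//  same way; "== 1" and "!= 0" keep the original condition.)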
23167 if (ISD::isOverflowIntrOpRes(LHS) &&
23168 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
23169 (isNullConstant(RHS) || isOneConstant(RHS))) {
23170 SDValue Value, Overflow;
23171 X86::CondCode X86Cond;
23172 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
23173
23174 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
23175 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
23176
23177 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23178 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23179 Overflow);
23180 }
23181
23182 if (LHS.getSimpleValueType().isInteger()) {
23183 SDValue CCVal;
23184 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
23185 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23186 EFLAGS);
23187 }
23188
23189 if (CC == ISD::SETOEQ) {
23190 // For FCMP_OEQ, we can emit
23191 // two branches instead of an explicit AND instruction with a
23192 // separate test. However, we only do this if this block doesn't
23193 // have a fall-through edge, because this requires an explicit
23194 // jmp when the condition is false.
23195 if (Op.getNode()->hasOneUse()) {
23196 SDNode *User = *Op.getNode()->use_begin();
23197 // Look for an unconditional branch following this conditional branch.
23198 // We need this because we need to reverse the successors in order
23199 // to implement FCMP_OEQ.
23200 if (User->getOpcode() == ISD::BR) {
23201 SDValue FalseBB = User->getOperand(1);
23202 SDNode *NewBR =
23203 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
23204 assert(NewBR == User);
23205 (void)NewBR;
23206 Dest = FalseBB;
23207
23208 SDValue Cmp =
23209 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
23210 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
23211 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
23212 CCVal, Cmp);
23213 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
23214 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23215 Cmp);
23216 }
23217 }
23218 } else if (CC == ISD::SETUNE) {
23219 // For FCMP_UNE, we can emit
23220 // two branches instead of an explicit OR instruction with a
23221 // separate test.
23222 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
23223 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
23224 Chain =
23225 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
23226 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
23227 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23228 Cmp);
23229 } else {
23230 X86::CondCode X86Cond =
23231 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
23232 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
23233 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23234 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23235 Cmp);
23236 }
23237 }
23238
23239 if (ISD::isOverflowIntrOpRes(Cond)) {
23240 SDValue Value, Overflow;
23241 X86::CondCode X86Cond;
23242 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
23243
23244 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23245 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23246 Overflow);
23247 }
23248
23249 // Look past the truncate if the high bits are known zero.
23250 if (isTruncWithZeroHighBitsInput(Cond, DAG))
23251 Cond = Cond.getOperand(0);
23252
23253 EVT CondVT = Cond.getValueType();
23254
23255 // Add an AND with 1 if we don't already have one.
23256 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
23257 Cond =
23258 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
23259
23260 SDValue LHS = Cond;
23261 SDValue RHS = DAG.getConstant(0, dl, CondVT);
23262
23263 SDValue CCVal;
23264 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
23265 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
23266 EFLAGS);
23267}
23268
23269// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
23270// Calls to _alloca are needed to probe the stack when allocating more than 4k
23271// bytes in one go. Touching the stack at 4K increments is necessary to ensure
23272// that the guard pages used by the OS virtual memory manager are allocated in
23273// correct sequence.
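// (Editor's note: a sketch of why the probing matters, with hypothetical
//  numbers. Windows commits stack pages one guard page at a time, so an
//  allocation of, say, 0x3000 bytes must touch SP-0x1000, SP-0x2000 and
//  SP-0x3000 in that order; jumping straight to SP-0x3000 would skip past the
//  guard page and fault. _alloca/__chkstk performs exactly this page-by-page
//  walk before the stack pointer is finally adjusted.)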
23274SDValue
23275X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
23276 SelectionDAG &DAG) const {
23277 MachineFunction &MF = DAG.getMachineFunction();
23278 bool SplitStack = MF.shouldSplitStack();
23279 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
23280 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
23281 SplitStack || EmitStackProbeCall;
23282 SDLoc dl(Op);
23283
23284 // Get the inputs.
23285 SDNode *Node = Op.getNode();
23286 SDValue Chain = Op.getOperand(0);
23287 SDValue Size = Op.getOperand(1);
23288 MaybeAlign Alignment(Op.getConstantOperandVal(2));
23289 EVT VT = Node->getValueType(0);
23290
23291 // Chain the dynamic stack allocation so that it doesn't modify the stack
23292 // pointer when other instructions are using the stack.
23293 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
23294
23295 bool Is64Bit = Subtarget.is64Bit();
23296 MVT SPTy = getPointerTy(DAG.getDataLayout());
23297
23298 SDValue Result;
23299 if (!Lower) {
23300 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23301 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
23302 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
23303        " not tell us which reg is the stack pointer!");
23304
23305 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
23306 const Align StackAlign(TFI.getStackAlignment());
23307 if (hasInlineStackProbe(MF)) {
23308 MachineRegisterInfo &MRI = MF.getRegInfo();
23309
23310 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
23311 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
23312 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
23313 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
23314 DAG.getRegister(Vreg, SPTy));
23315 } else {
23316 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
23317 Chain = SP.getValue(1);
23318 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
23319 }
23320 if (Alignment && Alignment > StackAlign)
23321 Result =
23322 DAG.getNode(ISD::AND, dl, VT, Result,
23323 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
23324 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
23325 } else if (SplitStack) {
23326 MachineRegisterInfo &MRI = MF.getRegInfo();
23327
23328 if (Is64Bit) {
23329 // The 64-bit implementation of segmented stacks needs to clobber both r10
23330 // and r11. This makes it impossible to use segmented stacks with nested parameters.
23331 const Function &F = MF.getFunction();
23332 for (const auto &A : F.args()) {
23333 if (A.hasNestAttr())
23334 report_fatal_error("Cannot use segmented stacks with functions that "
23335 "have nested arguments.");
23336 }
23337 }
23338
23339 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
23340 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
23341 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
23342 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
23343 DAG.getRegister(Vreg, SPTy));
23344 } else {
23345 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
23346 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
23347 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
23348
23349 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23350 Register SPReg = RegInfo->getStackRegister();
23351 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
23352 Chain = SP.getValue(1);
23353
23354 if (Alignment) {
23355 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
23356 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
23357 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
23358 }
23359
23360 Result = SP;
23361 }
23362
23363 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
23364 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
23365
23366 SDValue Ops[2] = {Result, Chain};
23367 return DAG.getMergeValues(Ops, dl);
23368}
23369
23370SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
23371 MachineFunction &MF = DAG.getMachineFunction();
23372 auto PtrVT = getPointerTy(MF.getDataLayout());
23373 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
23374
23375 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
23376 SDLoc DL(Op);
23377
23378 if (!Subtarget.is64Bit() ||
23379 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
23380 // vastart just stores the address of the VarArgsFrameIndex slot into the
23381 // memory location argument.
23382 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
23383 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
23384 MachinePointerInfo(SV));
23385 }
23386
23387 // __va_list_tag:
23388 // gp_offset (0 - 6 * 8)
23389 // fp_offset (48 - 48 + 8 * 16)
23390 // overflow_arg_area (points to parameters passed in memory).
23391 // reg_save_area
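// (Editor's sketch of the SysV x86-64 layout the offsets below refer to;
//  field names follow the ABI document, not this file:
//    struct __va_list_tag {
//      unsigned int gp_offset;          // 0 .. 48   (6 GPRs * 8 bytes)
//      unsigned int fp_offset;          // 48 .. 176 (48 + 8 XMMs * 16 bytes)
//      void        *overflow_arg_area;  // stack-passed arguments
//      void        *reg_save_area;      // spilled register arguments
//    };
//  The stores below write these fields at byte offsets 0, 4, 8 and 16
//  (8 and 12 for a non-LP64 target).)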
23392 SmallVector<SDValue, 8> MemOps;
23393 SDValue FIN = Op.getOperand(1);
23394 // Store gp_offset
23395 SDValue Store = DAG.getStore(
23396 Op.getOperand(0), DL,
23397 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
23398 MachinePointerInfo(SV));
23399 MemOps.push_back(Store);
23400
23401 // Store fp_offset
23402 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
23403 Store = DAG.getStore(
23404 Op.getOperand(0), DL,
23405 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
23406 MachinePointerInfo(SV, 4));
23407 MemOps.push_back(Store);
23408
23409 // Store ptr to overflow_arg_area
23410 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
23411 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
23412 Store =
23413 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
23414 MemOps.push_back(Store);
23415
23416 // Store ptr to reg_save_area.
23417 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
23418 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
23419 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
23420 Store = DAG.getStore(
23421 Op.getOperand(0), DL, RSFIN, FIN,
23422 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
23423 MemOps.push_back(Store);
23424 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
23425}
23426
23427SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
23428 assert(Subtarget.is64Bit() &&
23429        "LowerVAARG only handles 64-bit va_arg!");
23430 assert(Op.getNumOperands() == 4);
23431
23432 MachineFunction &MF = DAG.getMachineFunction();
23433 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
23434 // The Win64 ABI uses char* instead of a structure.
23435 return DAG.expandVAArg(Op.getNode());
23436
23437 SDValue Chain = Op.getOperand(0);
23438 SDValue SrcPtr = Op.getOperand(1);
23439 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
23440 unsigned Align = Op.getConstantOperandVal(3);
23441 SDLoc dl(Op);
23442
23443 EVT ArgVT = Op.getNode()->getValueType(0);
23444 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23445 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
23446 uint8_t ArgMode = 0;
23447
23448 // Decide which area this value should be read from.
23449 // TODO: Implement the AMD64 ABI in its entirety. This simple
23450 // selection mechanism works only for the basic types.
23451 if (ArgVT == MVT::f80) {
23452 llvm_unreachable("va_arg for f80 not yet implemented");
23453 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
23454 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
23455 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
23456 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
23457 } else {
23458 llvm_unreachable("Unhandled argument type in LowerVAARG");
23459 }
23460
23461 if (ArgMode == 2) {
23462 // Sanity Check: Make sure using fp_offset makes sense.
23463 assert(!Subtarget.useSoftFloat() &&
23464        !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
23465        Subtarget.hasSSE1());
23466 }
23467
23468 // Insert VAARG_64 node into the DAG
23469 // VAARG_64 returns two values: Variable Argument Address, Chain
23470 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
23471 DAG.getConstant(ArgMode, dl, MVT::i8),
23472 DAG.getConstant(Align, dl, MVT::i32)};
23473 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
23474 SDValue VAARG = DAG.getMemIntrinsicNode(
23475 X86ISD::VAARG_64, dl,
23476 VTs, InstOps, MVT::i64,
23477 MachinePointerInfo(SV),
23478 /*Align=*/0,
23479 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
23480 Chain = VAARG.getValue(1);
23481
23482 // Load the next argument and return it
23483 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
23484}
23485
23486static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
23487 SelectionDAG &DAG) {
23488 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
23489 // where a va_list is still an i8*.
23490 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
23491 if (Subtarget.isCallingConvWin64(
23492 DAG.getMachineFunction().getFunction().getCallingConv()))
23493 // Probably a Win64 va_copy.
23494 return DAG.expandVACopy(Op.getNode());
23495
23496 SDValue Chain = Op.getOperand(0);
23497 SDValue DstPtr = Op.getOperand(1);
23498 SDValue SrcPtr = Op.getOperand(2);
23499 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
23500 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
23501 SDLoc DL(Op);
23502
23503 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
23504 Align(8), /*isVolatile*/ false, false, false,
23505 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
23506}
23507
23508// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
23509static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
23510 switch (Opc) {
23511 case ISD::SHL:
23512 case X86ISD::VSHL:
23513 case X86ISD::VSHLI:
23514 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
23515 case ISD::SRL:
23516 case X86ISD::VSRL:
23517 case X86ISD::VSRLI:
23518 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
23519 case ISD::SRA:
23520 case X86ISD::VSRA:
23521 case X86ISD::VSRAI:
23522 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
23523 }
23524 llvm_unreachable("Unknown target vector shift node");
23525}
23526
23527/// Handle vector element shifts where the shift amount is a constant.
23528/// Takes immediate version of shift as input.
23529static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
23530 SDValue SrcOp, uint64_t ShiftAmt,
23531 SelectionDAG &DAG) {
23532 MVT ElementType = VT.getVectorElementType();
23533
23534 // Bitcast the source vector to the output type, this is mainly necessary for
23535 // vXi8/vXi64 shifts.
23536 if (VT != SrcOp.getSimpleValueType())
23537 SrcOp = DAG.getBitcast(VT, SrcOp);
23538
23539 // Fold this packed shift into its first operand if ShiftAmt is 0.
23540 if (ShiftAmt == 0)
23541 return SrcOp;
23542
23543 // Check for ShiftAmt >= element width
23544 if (ShiftAmt >= ElementType.getSizeInBits()) {
23545 if (Opc == X86ISD::VSRAI)
23546 ShiftAmt = ElementType.getSizeInBits() - 1;
23547 else
23548 return DAG.getConstant(0, dl, VT);
23549 }
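// (Editor's note: e.g. for a v4i32 shift by 40, an arithmetic VSRAI is clamped
//  above to 31 so each lane becomes a splat of its sign bit, while a logical
//  VSHLI/VSRLI by an out-of-range amount simply folds to the zero vector,
//  matching the behaviour of the underlying PSRA/PSRL/PSLL instructions.)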
23550
23551 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
23552        && "Unknown target vector shift-by-constant node");
23553
23554 // Fold this packed vector shift into a build vector if SrcOp is a
23555 // vector of Constants or UNDEFs.
23556 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
23557 SmallVector<SDValue, 8> Elts;
23558 unsigned NumElts = SrcOp->getNumOperands();
23559
23560 switch (Opc) {
23561 default: llvm_unreachable("Unknown opcode!");
23562 case X86ISD::VSHLI:
23563 for (unsigned i = 0; i != NumElts; ++i) {
23564 SDValue CurrentOp = SrcOp->getOperand(i);
23565 if (CurrentOp->isUndef()) {
23566 Elts.push_back(CurrentOp);
23567 continue;
23568 }
23569 auto *ND = cast<ConstantSDNode>(CurrentOp);
23570 const APInt &C = ND->getAPIntValue();
23571 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
23572 }
23573 break;
23574 case X86ISD::VSRLI:
23575 for (unsigned i = 0; i != NumElts; ++i) {
23576 SDValue CurrentOp = SrcOp->getOperand(i);
23577 if (CurrentOp->isUndef()) {
23578 Elts.push_back(CurrentOp);
23579 continue;
23580 }
23581 auto *ND = cast<ConstantSDNode>(CurrentOp);
23582 const APInt &C = ND->getAPIntValue();
23583 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
23584 }
23585 break;
23586 case X86ISD::VSRAI:
23587 for (unsigned i = 0; i != NumElts; ++i) {
23588 SDValue CurrentOp = SrcOp->getOperand(i);
23589 if (CurrentOp->isUndef()) {
23590 Elts.push_back(CurrentOp);
23591 continue;
23592 }
23593 auto *ND = cast<ConstantSDNode>(CurrentOp);
23594 const APInt &C = ND->getAPIntValue();
23595 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
23596 }
23597 break;
23598 }
23599
23600 return DAG.getBuildVector(VT, dl, Elts);
23601 }
23602
23603 return DAG.getNode(Opc, dl, VT, SrcOp,
23604 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
23605}
23606
23607/// Handle vector element shifts where the shift amount may or may not be a
23608/// constant. Takes immediate version of shift as input.
23609static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
23610 SDValue SrcOp, SDValue ShAmt,
23611 const X86Subtarget &Subtarget,
23612 SelectionDAG &DAG) {
23613 MVT SVT = ShAmt.getSimpleValueType();
23614 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
23615
23616 // Catch shift-by-constant.
23617 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
23618 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
23619 CShAmt->getZExtValue(), DAG);
23620
23621 // Change opcode to non-immediate version.
23622 Opc = getTargetVShiftUniformOpcode(Opc, true);
23623
23624 // Need to build a vector containing shift amount.
23625 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
23626 // +====================+============+=======================================+
23627 // | ShAmt is           | HasSSE4.1? | Construct ShAmt vector as             |
23628 // +====================+============+=======================================+
23629 // | i64                | Yes, No    | Use ShAmt as lowest elt               |
23630 // | i32                | Yes        | zero-extend in-reg                    |
23631 // | (i32 zext(i16/i8)) | Yes        | zero-extend in-reg                    |
23632 // | (i32 zext(i16/i8)) | No         | byte-shift-in-reg                     |
23633 // | i16/i32            | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
23634 // +====================+============+=======================================+
23635
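// (Editor's note: a sketch of the "byte-shift-in-reg" row above, assuming a
//  shift amount of the form (i32 zero_extend (i16 extract_vector_elt ...)) on
//  a target without SSE4.1. AmtTy is v8i16, so ByteShift = (128 - 16) / 8 = 14:
//  the scalar is placed in lane 0, shifted left by 14 bytes and back right by
//  14 bytes, leaving the 16-bit amount zero-extended in the low 64 bits of the
//  v2i64 shift-count operand.)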
23636 if (SVT == MVT::i64)
23637 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
23638 else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
23639 ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
23640 (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 ||
23641 ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) {
23642 ShAmt = ShAmt.getOperand(0);
23643 MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
23644 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
23645 if (Subtarget.hasSSE41())
23646 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
23647 MVT::v2i64, ShAmt);
23648 else {
23649 SDValue ByteShift = DAG.getTargetConstant(
23650 (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
23651 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
23652 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
23653 ByteShift);
23654 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
23655 ByteShift);
23656 }
23657 } else if (Subtarget.hasSSE41() &&
23658 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23659 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
23660 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
23661 MVT::v2i64, ShAmt);
23662 } else {
23663 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
23664 DAG.getUNDEF(SVT)};
23665 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
23666 }
23667
23668 // The return type has to be a 128-bit type with the same element
23669 // type as the input type.
23670 MVT EltVT = VT.getVectorElementType();
23671 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
23672
23673 ShAmt = DAG.getBitcast(ShVT, ShAmt);
23674 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
23675}
23676
23677/// Return Mask with the necessary casting or extending
23678/// for \p Mask according to \p MaskVT when lowering masking intrinsics
23679static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
23680 const X86Subtarget &Subtarget, SelectionDAG &DAG,
23681 const SDLoc &dl) {
23682
23683 if (isAllOnesConstant(Mask))
23684 return DAG.getConstant(1, dl, MaskVT);
23685 if (X86::isZeroNode(Mask))
23686 return DAG.getConstant(0, dl, MaskVT);
23687
23688 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
23689
23690 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
23691 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
23692 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
23693 // In 32-bit mode, a bitcast of i64 is illegal; split it into two i32 halves instead.
23694 SDValue Lo, Hi;
23695 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
23696 DAG.getConstant(0, dl, MVT::i32));
23697 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
23698 DAG.getConstant(1, dl, MVT::i32));
23699
23700 Lo = DAG.getBitcast(MVT::v32i1, Lo);
23701 Hi = DAG.getBitcast(MVT::v32i1, Hi);
23702
23703 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
23704 } else {
23705 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
23706 Mask.getSimpleValueType().getSizeInBits());
23707 // In the case where MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
23708 // are extracted by EXTRACT_SUBVECTOR.
23709 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
23710 DAG.getBitcast(BitcastVT, Mask),
23711 DAG.getIntPtrConstant(0, dl));
23712 }
23713}
23714
23715/// Return (and \p Op, \p Mask) for compare instructions or
23716/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
23717/// necessary casting or extending for \p Mask when lowering masking intrinsics
23718static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
23719 SDValue PreservedSrc,
23720 const X86Subtarget &Subtarget,
23721 SelectionDAG &DAG) {
23722 MVT VT = Op.getSimpleValueType();
23723 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23724 unsigned OpcodeSelect = ISD::VSELECT;
23725 SDLoc dl(Op);
23726
23727 if (isAllOnesConstant(Mask))
23728 return Op;
23729
23730 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
23731
23732 if (PreservedSrc.isUndef())
23733 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
23734 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
23735}
23736
23737/// Creates an SDNode for a predicated scalar operation.
23738/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
23739 /// The mask comes in as MVT::i8 and should be transformed
23740 /// to MVT::v1i1 while lowering masking intrinsics.
23741 /// The main difference between ScalarMaskingNode and VectorMaskingNode is that
23742 /// the former uses "X86select" instead of "vselect"; we simply can't create a
23743 /// "vselect" node for a scalar instruction.
23744static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
23745 SDValue PreservedSrc,
23746 const X86Subtarget &Subtarget,
23747 SelectionDAG &DAG) {
23748
23749 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
23750 if (MaskConst->getZExtValue() & 0x1)
23751 return Op;
23752
23753 MVT VT = Op.getSimpleValueType();
23754 SDLoc dl(Op);
23755
23756 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
23757 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
23758 DAG.getBitcast(MVT::v8i1, Mask),
23759 DAG.getIntPtrConstant(0, dl));
23760 if (Op.getOpcode() == X86ISD::FSETCCM ||
23761 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
23762 Op.getOpcode() == X86ISD::VFPCLASSS)
23763 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
23764
23765 if (PreservedSrc.isUndef())
23766 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
23767 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
23768}
23769
23770static int getSEHRegistrationNodeSize(const Function *Fn) {
23771 if (!Fn->hasPersonalityFn())
23772 report_fatal_error(
23773 "querying registration node size for function without personality");
23774 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
23775 // WinEHStatePass for the full struct definition.
23776 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
23777 case EHPersonality::MSVC_X86SEH: return 24;
23778 case EHPersonality::MSVC_CXX: return 16;
23779 default: break;
23780 }
23781 report_fatal_error(
23782 "can only recover FP for 32-bit MSVC EH personality functions");
23783}
23784
23785/// When the MSVC runtime transfers control to us, either to an outlined
23786/// function or when returning to a parent frame after catching an exception, we
23787/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
23788/// Here's the math:
23789/// RegNodeBase = EntryEBP - RegNodeSize
23790/// ParentFP = RegNodeBase - ParentFrameOffset
23791/// Subtracting RegNodeSize takes us to the offset of the registration node, and
23792/// subtracting the offset (negative on x86) takes us back to the parent FP.
23793static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
23794 SDValue EntryEBP) {
23795 MachineFunction &MF = DAG.getMachineFunction();
23796 SDLoc dl;
23797
23798 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23799 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
23800
23801 // It's possible that the parent function no longer has a personality function
23802 // if the exceptional code was optimized away, in which case we just return
23803 // the incoming EBP.
23804 if (!Fn->hasPersonalityFn())
23805 return EntryEBP;
23806
23807 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
23808 // registration, or the .set_setframe offset.
23809 MCSymbol *OffsetSym =
23810 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
23811 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
23812 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
23813 SDValue ParentFrameOffset =
23814 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
23815
23816 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
23817 // prologue to RBP in the parent function.
23818 const X86Subtarget &Subtarget =
23819 static_cast<const X86Subtarget &>(DAG.getSubtarget());
23820 if (Subtarget.is64Bit())
23821 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
23822
23823 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
23824 // RegNodeBase = EntryEBP - RegNodeSize
23825 // ParentFP = RegNodeBase - ParentFrameOffset
23826 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
23827 DAG.getConstant(RegNodeSize, dl, PtrVT));
23828 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
23829}
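
Editorial aside (not part of X86ISelLowering.cpp): a standalone sketch of the 32-bit recovery arithmetic above, with hypothetical example values; recoverParentFP is a made-up name, and RegNodeSize = 24 corresponds to the MSVC_X86SEH case of getSEHRegistrationNodeSize.

#include <cstdint>

// ParentFP = (EntryEBP - RegNodeSize) - ParentFrameOffset, exactly as in the
// comment above; ParentFrameOffset is negative on x86, so subtracting it moves
// the pointer back up to the parent frame.
std::intptr_t recoverParentFP(std::intptr_t EntryEBP, std::intptr_t RegNodeSize,
                              std::intptr_t ParentFrameOffset) {
  std::intptr_t RegNodeBase = EntryEBP - RegNodeSize;
  return RegNodeBase - ParentFrameOffset;
}

// Hypothetical example: EntryEBP = 0x1000, RegNodeSize = 24 (SEH),
// ParentFrameOffset = -64  =>  ParentFP = 0x1000 - 24 + 64 = 0x1028.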
23830
23831SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
23832 SelectionDAG &DAG) const {
23833 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
23834 auto isRoundModeCurDirection = [](SDValue Rnd) {
23835 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
23836 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
23837
23838 return false;
23839 };
23840 auto isRoundModeSAE = [](SDValue Rnd) {
23841 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
23842 unsigned RC = C->getZExtValue();
23843 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
23844 // Clear the NO_EXC bit and check remaining bits.
23845 RC ^= X86::STATIC_ROUNDING::NO_EXC;
23846        // As a convenience, allow either no other bits set or an explicit
23847        // CUR_DIRECTION.
23848 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
23849 }
23850 }
23851
23852 return false;
23853 };
23854 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
23855 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
23856 RC = C->getZExtValue();
23857 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
23858 // Clear the NO_EXC bit and check remaining bits.
23859 RC ^= X86::STATIC_ROUNDING::NO_EXC;
23860 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
23861 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
23862 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
23863 RC == X86::STATIC_ROUNDING::TO_ZERO;
23864 }
23865 }
23866
23867 return false;
23868 };
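  // Editorial note (added; not in the LLVM source): assuming the usual
  // X86::STATIC_ROUNDING encoding (TO_NEAREST_INT = 0, TO_NEG_INF = 1,
  // TO_POS_INF = 2, TO_ZERO = 3, CUR_DIRECTION = 4, NO_EXC = 8), an operand of
  // 9 (TO_NEG_INF | NO_EXC) satisfies isRoundModeSAEToX with RC = 1 once NO_EXC
  // is cleared, 12 (CUR_DIRECTION | NO_EXC) satisfies isRoundModeSAE only, and
  // 4 (CUR_DIRECTION alone) satisfies only isRoundModeCurDirection.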
23869
23870 SDLoc dl(Op);
23871 unsigned IntNo = Op.getConstantOperandVal(0);
23872 MVT VT = Op.getSimpleValueType();
23873 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
23874
23875 if (IntrData) {
23876 switch(IntrData->Type) {
23877 case INTR_TYPE_1OP: {
23878 // We specify 2 possible opcodes for intrinsics with rounding modes.
23879 // First, we check if the intrinsic may have non-default rounding mode,
23880 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23881 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23882 if (IntrWithRoundingModeOpcode != 0) {
23883 SDValue Rnd = Op.getOperand(2);
23884 unsigned RC = 0;
23885 if (isRoundModeSAEToX(Rnd, RC))
23886 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23887 Op.getOperand(1),
23888 DAG.getTargetConstant(RC, dl, MVT::i32));
23889 if (!isRoundModeCurDirection(Rnd))
23890 return SDValue();
23891 }
23892 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23893 Op.getOperand(1));
23894 }
23895 case INTR_TYPE_1OP_SAE: {
23896 SDValue Sae = Op.getOperand(2);
23897
23898 unsigned Opc;
23899 if (isRoundModeCurDirection(Sae))
23900 Opc = IntrData->Opc0;
23901 else if (isRoundModeSAE(Sae))
23902 Opc = IntrData->Opc1;
23903 else
23904 return SDValue();
23905
23906 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
23907 }
23908 case INTR_TYPE_2OP: {
23909 SDValue Src2 = Op.getOperand(2);
23910
23911 // We specify 2 possible opcodes for intrinsics with rounding modes.
23912 // First, we check if the intrinsic may have non-default rounding mode,
23913 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23914 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23915 if (IntrWithRoundingModeOpcode != 0) {
23916 SDValue Rnd = Op.getOperand(3);
23917 unsigned RC = 0;
23918 if (isRoundModeSAEToX(Rnd, RC))
23919 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23920 Op.getOperand(1), Src2,
23921 DAG.getTargetConstant(RC, dl, MVT::i32));
23922 if (!isRoundModeCurDirection(Rnd))
23923 return SDValue();
23924 }
23925
23926 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23927 Op.getOperand(1), Src2);
23928 }
23929 case INTR_TYPE_2OP_SAE: {
23930 SDValue Sae = Op.getOperand(3);
23931
23932 unsigned Opc;
23933 if (isRoundModeCurDirection(Sae))
23934 Opc = IntrData->Opc0;
23935 else if (isRoundModeSAE(Sae))
23936 Opc = IntrData->Opc1;
23937 else
23938 return SDValue();
23939
23940 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
23941 Op.getOperand(2));
23942 }
23943 case INTR_TYPE_3OP:
23944 case INTR_TYPE_3OP_IMM8: {
23945 SDValue Src1 = Op.getOperand(1);
23946 SDValue Src2 = Op.getOperand(2);
23947 SDValue Src3 = Op.getOperand(3);
23948
23949 // We specify 2 possible opcodes for intrinsics with rounding modes.
23950 // First, we check if the intrinsic may have non-default rounding mode,
23951 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
23952 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23953 if (IntrWithRoundingModeOpcode != 0) {
23954 SDValue Rnd = Op.getOperand(4);
23955 unsigned RC = 0;
23956 if (isRoundModeSAEToX(Rnd, RC))
23957 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23958 Src1, Src2, Src3,
23959 DAG.getTargetConstant(RC, dl, MVT::i32));
23960 if (!isRoundModeCurDirection(Rnd))
23961 return SDValue();
23962 }
23963
23964 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
23965 {Src1, Src2, Src3});
23966 }
23967 case INTR_TYPE_4OP:
23968 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
23969 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
23970 case INTR_TYPE_1OP_MASK: {
23971 SDValue Src = Op.getOperand(1);
23972 SDValue PassThru = Op.getOperand(2);
23973 SDValue Mask = Op.getOperand(3);
23974 // We add rounding mode to the Node when
23975 // - RC Opcode is specified and
23976 // - RC is not "current direction".
23977 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
23978 if (IntrWithRoundingModeOpcode != 0) {
23979 SDValue Rnd = Op.getOperand(4);
23980 unsigned RC = 0;
23981 if (isRoundModeSAEToX(Rnd, RC))
23982 return getVectorMaskingNode(
23983 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
23984 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
23985 Mask, PassThru, Subtarget, DAG);
23986 if (!isRoundModeCurDirection(Rnd))
23987 return SDValue();
23988 }
23989 return getVectorMaskingNode(
23990 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
23991 Subtarget, DAG);
23992 }
23993 case INTR_TYPE_1OP_MASK_SAE: {
23994 SDValue Src = Op.getOperand(1);
23995 SDValue PassThru = Op.getOperand(2);
23996 SDValue Mask = Op.getOperand(3);
23997 SDValue Rnd = Op.getOperand(4);
23998
23999 unsigned Opc;
24000 if (isRoundModeCurDirection(Rnd))
24001 Opc = IntrData->Opc0;
24002 else if (isRoundModeSAE(Rnd))
24003 Opc = IntrData->Opc1;
24004 else
24005 return SDValue();
24006
24007 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
24008 Subtarget, DAG);
24009 }
24010 case INTR_TYPE_SCALAR_MASK: {
24011 SDValue Src1 = Op.getOperand(1);
24012 SDValue Src2 = Op.getOperand(2);
24013 SDValue passThru = Op.getOperand(3);
24014 SDValue Mask = Op.getOperand(4);
24015 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
24016      // There are 2 kinds of intrinsics in this group:
24017      // (1) With suppress-all-exceptions (SAE) or a rounding-mode operand - 6 operands.
24018      // (2) With both a rounding-mode operand and SAE - 7 operands.
24019 bool HasRounding = IntrWithRoundingModeOpcode != 0;
24020 if (Op.getNumOperands() == (5U + HasRounding)) {
24021 if (HasRounding) {
24022 SDValue Rnd = Op.getOperand(5);
24023 unsigned RC = 0;
24024 if (isRoundModeSAEToX(Rnd, RC))
24025 return getScalarMaskingNode(
24026 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
24027 DAG.getTargetConstant(RC, dl, MVT::i32)),
24028 Mask, passThru, Subtarget, DAG);
24029 if (!isRoundModeCurDirection(Rnd))
24030 return SDValue();
24031 }
24032 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
24033 Src2),
24034 Mask, passThru, Subtarget, DAG);
24035 }
24036
24037       assert(Op.getNumOperands() == (6U + HasRounding) &&
24038              "Unexpected intrinsic form");
24039 SDValue RoundingMode = Op.getOperand(5);
24040 unsigned Opc = IntrData->Opc0;
24041 if (HasRounding) {
24042 SDValue Sae = Op.getOperand(6);
24043 if (isRoundModeSAE(Sae))
24044 Opc = IntrWithRoundingModeOpcode;
24045 else if (!isRoundModeCurDirection(Sae))
24046 return SDValue();
24047 }
24048 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
24049 Src2, RoundingMode),
24050 Mask, passThru, Subtarget, DAG);
24051 }
24052 case INTR_TYPE_SCALAR_MASK_RND: {
24053 SDValue Src1 = Op.getOperand(1);
24054 SDValue Src2 = Op.getOperand(2);
24055 SDValue passThru = Op.getOperand(3);
24056 SDValue Mask = Op.getOperand(4);
24057 SDValue Rnd = Op.getOperand(5);
24058
24059 SDValue NewOp;
24060 unsigned RC = 0;
24061 if (isRoundModeCurDirection(Rnd))
24062 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
24063 else if (isRoundModeSAEToX(Rnd, RC))
24064 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
24065 DAG.getTargetConstant(RC, dl, MVT::i32));
24066 else
24067 return SDValue();
24068
24069 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
24070 }
24071 case INTR_TYPE_SCALAR_MASK_SAE: {
24072 SDValue Src1 = Op.getOperand(1);
24073 SDValue Src2 = Op.getOperand(2);
24074 SDValue passThru = Op.getOperand(3);
24075 SDValue Mask = Op.getOperand(4);
24076 SDValue Sae = Op.getOperand(5);
24077 unsigned Opc;
24078 if (isRoundModeCurDirection(Sae))
24079 Opc = IntrData->Opc0;
24080 else if (isRoundModeSAE(Sae))
24081 Opc = IntrData->Opc1;
24082 else
24083 return SDValue();
24084
24085 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
24086 Mask, passThru, Subtarget, DAG);
24087 }
24088 case INTR_TYPE_2OP_MASK: {
24089 SDValue Src1 = Op.getOperand(1);
24090 SDValue Src2 = Op.getOperand(2);
24091 SDValue PassThru = Op.getOperand(3);
24092 SDValue Mask = Op.getOperand(4);
24093 SDValue NewOp;
24094 if (IntrData->Opc1 != 0) {
24095 SDValue Rnd = Op.getOperand(5);
24096 unsigned RC = 0;
24097 if (isRoundModeSAEToX(Rnd, RC))
24098 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
24099 DAG.getTargetConstant(RC, dl, MVT::i32));
24100 else if (!isRoundModeCurDirection(Rnd))
24101 return SDValue();
24102 }
24103 if (!NewOp)
24104 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
24105 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
24106 }
24107 case INTR_TYPE_2OP_MASK_SAE: {
24108 SDValue Src1 = Op.getOperand(1);
24109 SDValue Src2 = Op.getOperand(2);
24110 SDValue PassThru = Op.getOperand(3);
24111 SDValue Mask = Op.getOperand(4);
24112
24113 unsigned Opc = IntrData->Opc0;
24114 if (IntrData->Opc1 != 0) {
24115 SDValue Sae = Op.getOperand(5);
24116 if (isRoundModeSAE(Sae))
24117 Opc = IntrData->Opc1;
24118 else if (!isRoundModeCurDirection(Sae))
24119 return SDValue();
24120 }
24121
24122 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
24123 Mask, PassThru, Subtarget, DAG);
24124 }
24125 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
24126 SDValue Src1 = Op.getOperand(1);
24127 SDValue Src2 = Op.getOperand(2);
24128 SDValue Src3 = Op.getOperand(3);
24129 SDValue PassThru = Op.getOperand(4);
24130 SDValue Mask = Op.getOperand(5);
24131 SDValue Sae = Op.getOperand(6);
24132 unsigned Opc;
24133 if (isRoundModeCurDirection(Sae))
24134 Opc = IntrData->Opc0;
24135 else if (isRoundModeSAE(Sae))
24136 Opc = IntrData->Opc1;
24137 else
24138 return SDValue();
24139
24140 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
24141 Mask, PassThru, Subtarget, DAG);
24142 }
24143 case INTR_TYPE_3OP_MASK_SAE: {
24144 SDValue Src1 = Op.getOperand(1);
24145 SDValue Src2 = Op.getOperand(2);
24146 SDValue Src3 = Op.getOperand(3);
24147 SDValue PassThru = Op.getOperand(4);
24148 SDValue Mask = Op.getOperand(5);
24149
24150 unsigned Opc = IntrData->Opc0;
24151 if (IntrData->Opc1 != 0) {
24152 SDValue Sae = Op.getOperand(6);
24153 if (isRoundModeSAE(Sae))
24154 Opc = IntrData->Opc1;
24155 else if (!isRoundModeCurDirection(Sae))
24156 return SDValue();
24157 }
24158 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
24159 Mask, PassThru, Subtarget, DAG);
24160 }
24161 case BLENDV: {
24162 SDValue Src1 = Op.getOperand(1);
24163 SDValue Src2 = Op.getOperand(2);
24164 SDValue Src3 = Op.getOperand(3);
24165
24166 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
24167 Src3 = DAG.getBitcast(MaskVT, Src3);
24168
24169 // Reverse the operands to match VSELECT order.
24170 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
24171 }
24172 case VPERM_2OP : {
24173 SDValue Src1 = Op.getOperand(1);
24174 SDValue Src2 = Op.getOperand(2);
24175
24176 // Swap Src1 and Src2 in the node creation
24177 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
24178 }
24179 case IFMA_OP:
24180 // NOTE: We need to swizzle the operands to pass the multiply operands
24181 // first.
24182 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24183 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
24184 case FPCLASSS: {
24185 SDValue Src1 = Op.getOperand(1);
24186 SDValue Imm = Op.getOperand(2);
24187 SDValue Mask = Op.getOperand(3);
24188 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
24189 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
24190 Subtarget, DAG);
24191 // Need to fill with zeros to ensure the bitcast will produce zeroes
24192 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
24193 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
24194 DAG.getConstant(0, dl, MVT::v8i1),
24195 FPclassMask, DAG.getIntPtrConstant(0, dl));
24196 return DAG.getBitcast(MVT::i8, Ins);
24197 }
24198
24199 case CMP_MASK_CC: {
24200 MVT MaskVT = Op.getSimpleValueType();
24201 SDValue CC = Op.getOperand(3);
24202 // We specify 2 possible opcodes for intrinsics with rounding modes.
24203 // First, we check if the intrinsic may have non-default rounding mode,
24204 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
24205 if (IntrData->Opc1 != 0) {
24206 SDValue Sae = Op.getOperand(4);
24207 if (isRoundModeSAE(Sae))
24208 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
24209 Op.getOperand(2), CC, Sae);
24210 if (!isRoundModeCurDirection(Sae))
24211 return SDValue();
24212 }
24213       // Default rounding mode.
24214 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
24215 {Op.getOperand(1), Op.getOperand(2), CC});
24216 }
24217 case CMP_MASK_SCALAR_CC: {
24218 SDValue Src1 = Op.getOperand(1);
24219 SDValue Src2 = Op.getOperand(2);
24220 SDValue CC = Op.getOperand(3);
24221 SDValue Mask = Op.getOperand(4);
24222
24223 SDValue Cmp;
24224 if (IntrData->Opc1 != 0) {
24225 SDValue Sae = Op.getOperand(5);
24226 if (isRoundModeSAE(Sae))
24227 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
24228 else if (!isRoundModeCurDirection(Sae))
24229 return SDValue();
24230 }
24231       // Default rounding mode.
24232 if (!Cmp.getNode())
24233 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
24234
24235 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
24236 Subtarget, DAG);
24237 // Need to fill with zeros to ensure the bitcast will produce zeroes
24238 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
24239 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
24240 DAG.getConstant(0, dl, MVT::v8i1),
24241 CmpMask, DAG.getIntPtrConstant(0, dl));
24242 return DAG.getBitcast(MVT::i8, Ins);
24243 }
24244 case COMI: { // Comparison intrinsics
24245 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
24246 SDValue LHS = Op.getOperand(1);
24247 SDValue RHS = Op.getOperand(2);
24248 // Some conditions require the operands to be swapped.
24249 if (CC == ISD::SETLT || CC == ISD::SETLE)
24250 std::swap(LHS, RHS);
24251
24252 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
24253 SDValue SetCC;
24254 switch (CC) {
24255 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
24256 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
24257 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
24258 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
24259 break;
24260 }
24261 case ISD::SETNE: { // (ZF = 1 or PF = 1)
24262 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
24263 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
24264 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
24265 break;
24266 }
24267 case ISD::SETGT: // (CF = 0 and ZF = 0)
24268 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
24269 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
24270 break;
24271 }
24272 case ISD::SETGE: // CF = 0
24273 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
24274 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
24275 break;
24276 default:
24277 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24277)
;
24278 }
24279 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
24280 }
24281 case COMI_RM: { // Comparison intrinsics with Sae
24282 SDValue LHS = Op.getOperand(1);
24283 SDValue RHS = Op.getOperand(2);
24284 unsigned CondVal = Op.getConstantOperandVal(3);
24285 SDValue Sae = Op.getOperand(4);
24286
24287 SDValue FCmp;
24288 if (isRoundModeCurDirection(Sae))
24289 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
24290 DAG.getTargetConstant(CondVal, dl, MVT::i8));
24291 else if (isRoundModeSAE(Sae))
24292 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
24293 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
24294 else
24295 return SDValue();
24296 // Need to fill with zeros to ensure the bitcast will produce zeroes
24297 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
24298 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24299 DAG.getConstant(0, dl, MVT::v16i1),
24300 FCmp, DAG.getIntPtrConstant(0, dl));
24301 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
24302 DAG.getBitcast(MVT::i16, Ins));
24303 }
24304 case VSHIFT:
24305 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
24306 Op.getOperand(1), Op.getOperand(2), Subtarget,
24307 DAG);
24308 case COMPRESS_EXPAND_IN_REG: {
24309 SDValue Mask = Op.getOperand(3);
24310 SDValue DataToCompress = Op.getOperand(1);
24311 SDValue PassThru = Op.getOperand(2);
24312 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
24313 return Op.getOperand(1);
24314
24315 // Avoid false dependency.
24316 if (PassThru.isUndef())
24317 PassThru = DAG.getConstant(0, dl, VT);
24318
24319 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
24320 Mask);
24321 }
24322 case FIXUPIMM:
24323 case FIXUPIMM_MASKZ: {
24324 SDValue Src1 = Op.getOperand(1);
24325 SDValue Src2 = Op.getOperand(2);
24326 SDValue Src3 = Op.getOperand(3);
24327 SDValue Imm = Op.getOperand(4);
24328 SDValue Mask = Op.getOperand(5);
24329 SDValue Passthru = (IntrData->Type == FIXUPIMM)
24330 ? Src1
24331 : getZeroVector(VT, Subtarget, DAG, dl);
24332
24333 unsigned Opc = IntrData->Opc0;
24334 if (IntrData->Opc1 != 0) {
24335 SDValue Sae = Op.getOperand(6);
24336 if (isRoundModeSAE(Sae))
24337 Opc = IntrData->Opc1;
24338 else if (!isRoundModeCurDirection(Sae))
24339 return SDValue();
24340 }
24341
24342 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
24343
24344 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
24345 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
24346
24347 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
24348 }
24349 case ROUNDP: {
24350     assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
24351 // Clear the upper bits of the rounding immediate so that the legacy
24352 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
24353 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
24354 SDValue RoundingMode =
24355 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
24356 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24357 Op.getOperand(1), RoundingMode);
24358 }
24359 case ROUNDS: {
24360     assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
24361 // Clear the upper bits of the rounding immediate so that the legacy
24362 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
24363 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
24364 SDValue RoundingMode =
24365 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
24366 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24367 Op.getOperand(1), Op.getOperand(2), RoundingMode);
24368 }
24369 case BEXTRI: {
24370     assert(IntrData->Opc0 == X86ISD::BEXTR && "Unexpected opcode");
24371
24372 // The control is a TargetConstant, but we need to convert it to a
24373 // ConstantSDNode.
24374 uint64_t Imm = Op.getConstantOperandVal(2);
24375 SDValue Control = DAG.getConstant(Imm, dl, Op.getValueType());
24376 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
24377 Op.getOperand(1), Control);
24378 }
24379 // ADC/ADCX/SBB
24380 case ADX: {
24381 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
24382 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
24383
24384 SDValue Res;
24385 // If the carry in is zero, then we should just use ADD/SUB instead of
24386 // ADC/SBB.
24387 if (isNullConstant(Op.getOperand(1))) {
24388 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
24389 Op.getOperand(3));
24390 } else {
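        // Editorial note (added; not in the LLVM source): adding 0xFF (-1 as i8)
        // to any nonzero carry-in value wraps past 255, so the X86ISD::ADD below
        // produces CF = 1; the ADC/SBB opcode (IntrData->Opc0) then consumes that
        // carry flag through GenCF.getValue(1).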
24391 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
24392 DAG.getConstant(-1, dl, MVT::i8));
24393 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
24394 Op.getOperand(3), GenCF.getValue(1));
24395 }
24396 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
24397 SDValue Results[] = { SetCC, Res };
24398 return DAG.getMergeValues(Results, dl);
24399 }
24400 case CVTPD2PS_MASK:
24401 case CVTPD2DQ_MASK:
24402 case CVTQQ2PS_MASK:
24403 case TRUNCATE_TO_REG: {
24404 SDValue Src = Op.getOperand(1);
24405 SDValue PassThru = Op.getOperand(2);
24406 SDValue Mask = Op.getOperand(3);
24407
24408 if (isAllOnesConstant(Mask))
24409 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
24410
24411 MVT SrcVT = Src.getSimpleValueType();
24412 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
24413 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24414 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
24415 {Src, PassThru, Mask});
24416 }
24417 case CVTPS2PH_MASK: {
24418 SDValue Src = Op.getOperand(1);
24419 SDValue Rnd = Op.getOperand(2);
24420 SDValue PassThru = Op.getOperand(3);
24421 SDValue Mask = Op.getOperand(4);
24422
24423 if (isAllOnesConstant(Mask))
24424 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
24425
24426 MVT SrcVT = Src.getSimpleValueType();
24427 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
24428 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24429 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
24430 PassThru, Mask);
24431
24432 }
24433 case CVTNEPS2BF16_MASK: {
24434 SDValue Src = Op.getOperand(1);
24435 SDValue PassThru = Op.getOperand(2);
24436 SDValue Mask = Op.getOperand(3);
24437
24438 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
24439 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
24440
24441 // Break false dependency.
24442 if (PassThru.isUndef())
24443 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
24444
24445 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
24446 Mask);
24447 }
24448 default:
24449 break;
24450 }
24451 }
24452
24453 switch (IntNo) {
24454 default: return SDValue(); // Don't custom lower most intrinsics.
24455
24456  // ptest and testp intrinsics. The intrinsics these come from are designed to
24457  // return an integer value, not just an instruction, so lower them to the ptest
24458  // or testp pattern plus a setcc for the result.
24459 case Intrinsic::x86_avx512_ktestc_b:
24460 case Intrinsic::x86_avx512_ktestc_w:
24461 case Intrinsic::x86_avx512_ktestc_d:
24462 case Intrinsic::x86_avx512_ktestc_q:
24463 case Intrinsic::x86_avx512_ktestz_b:
24464 case Intrinsic::x86_avx512_ktestz_w:
24465 case Intrinsic::x86_avx512_ktestz_d:
24466 case Intrinsic::x86_avx512_ktestz_q:
24467 case Intrinsic::x86_sse41_ptestz:
24468 case Intrinsic::x86_sse41_ptestc:
24469 case Intrinsic::x86_sse41_ptestnzc:
24470 case Intrinsic::x86_avx_ptestz_256:
24471 case Intrinsic::x86_avx_ptestc_256:
24472 case Intrinsic::x86_avx_ptestnzc_256:
24473 case Intrinsic::x86_avx_vtestz_ps:
24474 case Intrinsic::x86_avx_vtestc_ps:
24475 case Intrinsic::x86_avx_vtestnzc_ps:
24476 case Intrinsic::x86_avx_vtestz_pd:
24477 case Intrinsic::x86_avx_vtestc_pd:
24478 case Intrinsic::x86_avx_vtestnzc_pd:
24479 case Intrinsic::x86_avx_vtestz_ps_256:
24480 case Intrinsic::x86_avx_vtestc_ps_256:
24481 case Intrinsic::x86_avx_vtestnzc_ps_256:
24482 case Intrinsic::x86_avx_vtestz_pd_256:
24483 case Intrinsic::x86_avx_vtestc_pd_256:
24484 case Intrinsic::x86_avx_vtestnzc_pd_256: {
24485 unsigned TestOpc = X86ISD::PTEST;
24486 X86::CondCode X86CC;
24487 switch (IntNo) {
24488 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.")::llvm::llvm_unreachable_internal("Bad fallthrough in Intrinsic lowering."
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24488)
;
24489 case Intrinsic::x86_avx512_ktestc_b:
24490 case Intrinsic::x86_avx512_ktestc_w:
24491 case Intrinsic::x86_avx512_ktestc_d:
24492 case Intrinsic::x86_avx512_ktestc_q:
24493 // CF = 1
24494 TestOpc = X86ISD::KTEST;
24495 X86CC = X86::COND_B;
24496 break;
24497 case Intrinsic::x86_avx512_ktestz_b:
24498 case Intrinsic::x86_avx512_ktestz_w:
24499 case Intrinsic::x86_avx512_ktestz_d:
24500 case Intrinsic::x86_avx512_ktestz_q:
24501 TestOpc = X86ISD::KTEST;
24502 X86CC = X86::COND_E;
24503 break;
24504 case Intrinsic::x86_avx_vtestz_ps:
24505 case Intrinsic::x86_avx_vtestz_pd:
24506 case Intrinsic::x86_avx_vtestz_ps_256:
24507 case Intrinsic::x86_avx_vtestz_pd_256:
24508 TestOpc = X86ISD::TESTP;
24509 LLVM_FALLTHROUGH[[gnu::fallthrough]];
24510 case Intrinsic::x86_sse41_ptestz:
24511 case Intrinsic::x86_avx_ptestz_256:
24512 // ZF = 1
24513 X86CC = X86::COND_E;
24514 break;
24515 case Intrinsic::x86_avx_vtestc_ps:
24516 case Intrinsic::x86_avx_vtestc_pd:
24517 case Intrinsic::x86_avx_vtestc_ps_256:
24518 case Intrinsic::x86_avx_vtestc_pd_256:
24519 TestOpc = X86ISD::TESTP;
24520 LLVM_FALLTHROUGH[[gnu::fallthrough]];
24521 case Intrinsic::x86_sse41_ptestc:
24522 case Intrinsic::x86_avx_ptestc_256:
24523 // CF = 1
24524 X86CC = X86::COND_B;
24525 break;
24526 case Intrinsic::x86_avx_vtestnzc_ps:
24527 case Intrinsic::x86_avx_vtestnzc_pd:
24528 case Intrinsic::x86_avx_vtestnzc_ps_256:
24529 case Intrinsic::x86_avx_vtestnzc_pd_256:
24530 TestOpc = X86ISD::TESTP;
24531 LLVM_FALLTHROUGH[[gnu::fallthrough]];
24532 case Intrinsic::x86_sse41_ptestnzc:
24533 case Intrinsic::x86_avx_ptestnzc_256:
24534 // ZF and CF = 0
24535 X86CC = X86::COND_A;
24536 break;
24537 }
24538
24539 SDValue LHS = Op.getOperand(1);
24540 SDValue RHS = Op.getOperand(2);
24541 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
24542 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
24543 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
24544 }
24545
24546 case Intrinsic::x86_sse42_pcmpistria128:
24547 case Intrinsic::x86_sse42_pcmpestria128:
24548 case Intrinsic::x86_sse42_pcmpistric128:
24549 case Intrinsic::x86_sse42_pcmpestric128:
24550 case Intrinsic::x86_sse42_pcmpistrio128:
24551 case Intrinsic::x86_sse42_pcmpestrio128:
24552 case Intrinsic::x86_sse42_pcmpistris128:
24553 case Intrinsic::x86_sse42_pcmpestris128:
24554 case Intrinsic::x86_sse42_pcmpistriz128:
24555 case Intrinsic::x86_sse42_pcmpestriz128: {
24556 unsigned Opcode;
24557 X86::CondCode X86CC;
24558 switch (IntNo) {
24559 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24559)
; // Can't reach here.
24560 case Intrinsic::x86_sse42_pcmpistria128:
24561 Opcode = X86ISD::PCMPISTR;
24562 X86CC = X86::COND_A;
24563 break;
24564 case Intrinsic::x86_sse42_pcmpestria128:
24565 Opcode = X86ISD::PCMPESTR;
24566 X86CC = X86::COND_A;
24567 break;
24568 case Intrinsic::x86_sse42_pcmpistric128:
24569 Opcode = X86ISD::PCMPISTR;
24570 X86CC = X86::COND_B;
24571 break;
24572 case Intrinsic::x86_sse42_pcmpestric128:
24573 Opcode = X86ISD::PCMPESTR;
24574 X86CC = X86::COND_B;
24575 break;
24576 case Intrinsic::x86_sse42_pcmpistrio128:
24577 Opcode = X86ISD::PCMPISTR;
24578 X86CC = X86::COND_O;
24579 break;
24580 case Intrinsic::x86_sse42_pcmpestrio128:
24581 Opcode = X86ISD::PCMPESTR;
24582 X86CC = X86::COND_O;
24583 break;
24584 case Intrinsic::x86_sse42_pcmpistris128:
24585 Opcode = X86ISD::PCMPISTR;
24586 X86CC = X86::COND_S;
24587 break;
24588 case Intrinsic::x86_sse42_pcmpestris128:
24589 Opcode = X86ISD::PCMPESTR;
24590 X86CC = X86::COND_S;
24591 break;
24592 case Intrinsic::x86_sse42_pcmpistriz128:
24593 Opcode = X86ISD::PCMPISTR;
24594 X86CC = X86::COND_E;
24595 break;
24596 case Intrinsic::x86_sse42_pcmpestriz128:
24597 Opcode = X86ISD::PCMPESTR;
24598 X86CC = X86::COND_E;
24599 break;
24600 }
24601 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
24602 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
24603 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
24604 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
24605 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
24606 }
24607
24608 case Intrinsic::x86_sse42_pcmpistri128:
24609 case Intrinsic::x86_sse42_pcmpestri128: {
24610 unsigned Opcode;
24611 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
24612 Opcode = X86ISD::PCMPISTR;
24613 else
24614 Opcode = X86ISD::PCMPESTR;
24615
24616 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
24617 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
24618 return DAG.getNode(Opcode, dl, VTs, NewOps);
24619 }
24620
24621 case Intrinsic::x86_sse42_pcmpistrm128:
24622 case Intrinsic::x86_sse42_pcmpestrm128: {
24623 unsigned Opcode;
24624 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
24625 Opcode = X86ISD::PCMPISTR;
24626 else
24627 Opcode = X86ISD::PCMPESTR;
24628
24629 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
24630 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
24631 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
24632 }
24633
24634 case Intrinsic::eh_sjlj_lsda: {
24635 MachineFunction &MF = DAG.getMachineFunction();
24636 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24637 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
24638 auto &Context = MF.getMMI().getContext();
24639 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
24640 Twine(MF.getFunctionNumber()));
24641 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
24642 DAG.getMCSymbol(S, PtrVT));
24643 }
24644
24645 case Intrinsic::x86_seh_lsda: {
24646 // Compute the symbol for the LSDA. We know it'll get emitted later.
24647 MachineFunction &MF = DAG.getMachineFunction();
24648 SDValue Op1 = Op.getOperand(1);
24649 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
24650 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
24651 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
24652
24653 // Generate a simple absolute symbol reference. This intrinsic is only
24654 // supported on 32-bit Windows, which isn't PIC.
24655 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
24656 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
24657 }
24658
24659 case Intrinsic::eh_recoverfp: {
24660 SDValue FnOp = Op.getOperand(1);
24661 SDValue IncomingFPOp = Op.getOperand(2);
24662 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
24663 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
24664 if (!Fn)
24665 report_fatal_error(
24666 "llvm.eh.recoverfp must take a function as the first argument");
24667 return recoverFramePointer(DAG, Fn, IncomingFPOp);
24668 }
24669
24670 case Intrinsic::localaddress: {
24671 // Returns one of the stack, base, or frame pointer registers, depending on
24672 // which is used to reference local variables.
24673 MachineFunction &MF = DAG.getMachineFunction();
24674 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24675 unsigned Reg;
24676 if (RegInfo->hasBasePointer(MF))
24677 Reg = RegInfo->getBaseRegister();
24678 else { // Handles the SP or FP case.
24679 bool CantUseFP = RegInfo->needsStackRealignment(MF);
24680 if (CantUseFP)
24681 Reg = RegInfo->getPtrSizedStackRegister(MF);
24682 else
24683 Reg = RegInfo->getPtrSizedFrameRegister(MF);
24684 }
24685 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
24686 }
24687
24688 case Intrinsic::x86_avx512_vp2intersect_q_512:
24689 case Intrinsic::x86_avx512_vp2intersect_q_256:
24690 case Intrinsic::x86_avx512_vp2intersect_q_128:
24691 case Intrinsic::x86_avx512_vp2intersect_d_512:
24692 case Intrinsic::x86_avx512_vp2intersect_d_256:
24693 case Intrinsic::x86_avx512_vp2intersect_d_128: {
24694 MVT MaskVT = Op.getSimpleValueType();
24695
24696 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
24697 SDLoc DL(Op);
24698
24699 SDValue Operation =
24700 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
24701 Op->getOperand(1), Op->getOperand(2));
24702
24703 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
24704 MaskVT, Operation);
24705 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
24706 MaskVT, Operation);
24707 return DAG.getMergeValues({Result0, Result1}, DL);
24708 }
24709 case Intrinsic::x86_mmx_pslli_w:
24710 case Intrinsic::x86_mmx_pslli_d:
24711 case Intrinsic::x86_mmx_pslli_q:
24712 case Intrinsic::x86_mmx_psrli_w:
24713 case Intrinsic::x86_mmx_psrli_d:
24714 case Intrinsic::x86_mmx_psrli_q:
24715 case Intrinsic::x86_mmx_psrai_w:
24716 case Intrinsic::x86_mmx_psrai_d: {
24717 SDLoc DL(Op);
24718 SDValue ShAmt = Op.getOperand(2);
24719 // If the argument is a constant, convert it to a target constant.
24720 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
24721 // Clamp out of bounds shift amounts since they will otherwise be masked
24722 // to 8-bits which may make it no longer out of bounds.
24723 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
24724 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
24725 Op.getOperand(0), Op.getOperand(1),
24726 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
24727 }
24728
24729 unsigned NewIntrinsic;
24730 switch (IntNo) {
24731 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 24731)
; // Can't reach here.
24732 case Intrinsic::x86_mmx_pslli_w:
24733 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
24734 break;
24735 case Intrinsic::x86_mmx_pslli_d:
24736 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
24737 break;
24738 case Intrinsic::x86_mmx_pslli_q:
24739 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
24740 break;
24741 case Intrinsic::x86_mmx_psrli_w:
24742 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
24743 break;
24744 case Intrinsic::x86_mmx_psrli_d:
24745 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
24746 break;
24747 case Intrinsic::x86_mmx_psrli_q:
24748 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
24749 break;
24750 case Intrinsic::x86_mmx_psrai_w:
24751 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
24752 break;
24753 case Intrinsic::x86_mmx_psrai_d:
24754 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
24755 break;
24756 }
24757
24758     // The vector shift intrinsics with scalar amounts use 32-bit shift values, but
24759     // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
24760     // MMX register.
24761 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
24762 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
24763 DAG.getConstant(NewIntrinsic, DL, MVT::i32),
24764 Op.getOperand(1), ShAmt);
24765
24766 }
24767 }
24768}
24769
24770static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
24771 SDValue Src, SDValue Mask, SDValue Base,
24772 SDValue Index, SDValue ScaleOp, SDValue Chain,
24773 const X86Subtarget &Subtarget) {
24774 SDLoc dl(Op);
24775 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24776 // Scale must be constant.
24777 if (!C)
24778 return SDValue();
24779 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24780 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24781 TLI.getPointerTy(DAG.getDataLayout()));
24782 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
24783 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
24784 // If source is undef or we know it won't be used, use a zero vector
24785 // to break register dependency.
24786 // TODO: use undef instead and let BreakFalseDeps deal with it?
24787 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
24788 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
24789
24790 // Cast mask to an integer type.
24791 Mask = DAG.getBitcast(MaskVT, Mask);
24792
24793 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
24794
24795 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
24796 SDValue Res =
24797 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
24798 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
24799 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
24800}
24801
24802static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
24803 SDValue Src, SDValue Mask, SDValue Base,
24804 SDValue Index, SDValue ScaleOp, SDValue Chain,
24805 const X86Subtarget &Subtarget) {
24806 MVT VT = Op.getSimpleValueType();
24807 SDLoc dl(Op);
24808 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24809 // Scale must be constant.
24810 if (!C)
24811 return SDValue();
24812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24813 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24814 TLI.getPointerTy(DAG.getDataLayout()));
24815 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
24816 VT.getVectorNumElements());
24817 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
24818
24819 // We support two versions of the gather intrinsics. One with scalar mask and
24820 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
24821 if (Mask.getValueType() != MaskVT)
24822 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24823
24824 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
24825 // If source is undef or we know it won't be used, use a zero vector
24826 // to break register dependency.
24827 // TODO: use undef instead and let BreakFalseDeps deal with it?
24828 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
24829 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
24830
24831 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
24832
24833 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
24834 SDValue Res =
24835 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
24836 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
24837 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
24838}
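
Editorial aside (not part of X86ISelLowering.cpp): per-lane semantics of the masked gather that getGatherNode lowers to, as a standalone sketch with made-up names (maskedGather, PassThru) and with the byte scale folded into the element index for simplicity.

#include <cstddef>
#include <cstdint>
#include <vector>

// Active lanes load from Base[Index[i]]; inactive lanes keep the pass-through
// value, mirroring the Src/Mask handling in the X86ISD::MGATHER node above.
std::vector<int32_t> maskedGather(const int32_t *Base,
                                  const std::vector<int64_t> &Index,
                                  const std::vector<bool> &Mask,
                                  const std::vector<int32_t> &PassThru) {
  std::vector<int32_t> Result(PassThru);
  for (std::size_t I = 0; I < Index.size(); ++I)
    if (Mask[I])
      Result[I] = Base[Index[I]];
  return Result;
}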
24839
24840static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
24841 SDValue Src, SDValue Mask, SDValue Base,
24842 SDValue Index, SDValue ScaleOp, SDValue Chain,
24843 const X86Subtarget &Subtarget) {
24844 SDLoc dl(Op);
24845 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24846 // Scale must be constant.
24847 if (!C)
24848 return SDValue();
24849 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24850 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24851 TLI.getPointerTy(DAG.getDataLayout()));
24852 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
24853 Src.getSimpleValueType().getVectorNumElements());
24854 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
24855
24856 // We support two versions of the scatter intrinsics. One with scalar mask and
24857 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
24858 if (Mask.getValueType() != MaskVT)
24859 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24860
24861 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
24862
24863 SDVTList VTs = DAG.getVTList(MVT::Other);
24864 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
24865 SDValue Res =
24866 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
24867 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
24868 return Res;
24869}
24870
24871static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
24872 SDValue Mask, SDValue Base, SDValue Index,
24873 SDValue ScaleOp, SDValue Chain,
24874 const X86Subtarget &Subtarget) {
24875 SDLoc dl(Op);
24876 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
24877 // Scale must be constant.
24878 if (!C)
24879 return SDValue();
24880 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24881 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
24882 TLI.getPointerTy(DAG.getDataLayout()));
24883 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
24884 SDValue Segment = DAG.getRegister(0, MVT::i32);
24885 MVT MaskVT =
24886 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
24887 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
24888 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
24889 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
24890 return SDValue(Res, 0);
24891}
24892
24893/// Handles the lowering of builtin intrinsics with chain that return their
24894/// value in the EDX:EAX register pair.
24895/// If operand SrcReg is a valid register identifier, then operand 2 of N is
24896/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
24897/// TargetOpcode.
24898/// Returns a Glue value which can be used to add an extra copy-from-reg if the
24899/// expanded intrinsic implicitly defines extra registers (i.e. not just
24900/// EDX:EAX).
24901static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
24902 SelectionDAG &DAG,
24903 unsigned TargetOpcode,
24904 unsigned SrcReg,
24905 const X86Subtarget &Subtarget,
24906 SmallVectorImpl<SDValue> &Results) {
24907 SDValue Chain = N->getOperand(0);
24908 SDValue Glue;
24909
24910 if (SrcReg) {
24911     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
24912 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
24913 Glue = Chain.getValue(1);
24914 }
24915
24916 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24917 SDValue N1Ops[] = {Chain, Glue};
24918 SDNode *N1 = DAG.getMachineNode(
24919 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
24920 Chain = SDValue(N1, 0);
24921
24922   // The expanded instruction returns its 64-bit result in EDX:EAX; read both halves back out.
24923 SDValue LO, HI;
24924 if (Subtarget.is64Bit()) {
24925 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
24926 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
24927 LO.getValue(2));
24928 } else {
24929 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
24930 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
24931 LO.getValue(2));
24932 }
24933 Chain = HI.getValue(1);
24934 Glue = HI.getValue(2);
24935
24936 if (Subtarget.is64Bit()) {
24937 // Merge the two 32-bit values into a 64-bit one.
24938 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
24939 DAG.getConstant(32, DL, MVT::i8));
24940 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
24941 Results.push_back(Chain);
24942 return Glue;
24943 }
24944
24945 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
24946 SDValue Ops[] = { LO, HI };
24947 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
24948 Results.push_back(Pair);
24949 Results.push_back(Chain);
24950 return Glue;
24951}
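
Editorial aside (not part of X86ISelLowering.cpp): on 64-bit targets the helper above merges the two 32-bit halves read from RDX:RAX with a shift-by-32 and an OR; the standalone function below (made-up name mergeEDXEAX) performs the same computation.

#include <cstdint>

// Combine the low half (EAX) and high half (EDX) into a single 64-bit value,
// mirroring the ISD::SHL / ISD::OR pair built in the 64-bit path above.
uint64_t mergeEDXEAX(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}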
24952
24953/// Handles the lowering of builtin intrinsics that read the time stamp counter
24954/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
24955/// READCYCLECOUNTER nodes.
24956static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
24957 SelectionDAG &DAG,
24958 const X86Subtarget &Subtarget,
24959 SmallVectorImpl<SDValue> &Results) {
24960 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
24961 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
24962 // and the EAX register is loaded with the low-order 32 bits.
24963 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
24964 /* NoRegister */0, Subtarget,
24965 Results);
24966 if (Opcode != X86::RDTSCP)
24967 return;
24968
24969 SDValue Chain = Results[1];
24970  // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address C000_0103H)
24971  // into the ECX register. Add 'ecx' explicitly to the chain.
24972 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
24973 Results[1] = ecx;
24974 Results.push_back(ecx.getValue(1));
24975}
24976
24977static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
24978 SelectionDAG &DAG) {
24979 SmallVector<SDValue, 3> Results;
24980 SDLoc DL(Op);
24981 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
24982 Results);
24983 return DAG.getMergeValues(Results, DL);
24984}
24985
24986static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
24987 MachineFunction &MF = DAG.getMachineFunction();
24988 SDValue Chain = Op.getOperand(0);
24989 SDValue RegNode = Op.getOperand(2);
24990 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
24991 if (!EHInfo)
24992 report_fatal_error("EH registrations only live in functions using WinEH");
24993
24994 // Cast the operand to an alloca, and remember the frame index.
24995 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
24996 if (!FINode)
24997 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
24998 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
24999
25000 // Return the chain operand without making any DAG nodes.
25001 return Chain;
25002}
25003
25004static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
25005 MachineFunction &MF = DAG.getMachineFunction();
25006 SDValue Chain = Op.getOperand(0);
25007 SDValue EHGuard = Op.getOperand(2);
25008 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
25009 if (!EHInfo)
25010 report_fatal_error("EHGuard only live in functions using WinEH");
25011
25012 // Cast the operand to an alloca, and remember the frame index.
25013 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
25014 if (!FINode)
25015 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
25016 EHInfo->EHGuardFrameIndex = FINode->getIndex();
25017
25018 // Return the chain operand without making any DAG nodes.
25019 return Chain;
25020}
25021
25022/// Emit Truncating Store with signed or unsigned saturation.
25023static SDValue
25024EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
25025 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
25026 SelectionDAG &DAG) {
25027 SDVTList VTs = DAG.getVTList(MVT::Other);
25028 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
25029 SDValue Ops[] = { Chain, Val, Ptr, Undef };
25030 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
25031 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
25032}
25033
25034/// Emit Masked Truncating Store with signed or unsigned saturation.
25035static SDValue
25036EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
25037 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
25038 MachineMemOperand *MMO, SelectionDAG &DAG) {
25039 SDVTList VTs = DAG.getVTList(MVT::Other);
25040 SDValue Ops[] = { Chain, Val, Ptr, Mask };
25041 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
25042 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
25043}
25044
25045static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
25046 SelectionDAG &DAG) {
25047 unsigned IntNo = Op.getConstantOperandVal(1);
25048 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
25049 if (!IntrData) {
25050 switch (IntNo) {
25051 case llvm::Intrinsic::x86_seh_ehregnode:
25052 return MarkEHRegistrationNode(Op, DAG);
25053 case llvm::Intrinsic::x86_seh_ehguard:
25054 return MarkEHGuard(Op, DAG);
25055 case llvm::Intrinsic::x86_rdpkru: {
25056 SDLoc dl(Op);
25057 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25058 // Create a RDPKRU node and pass 0 to the ECX parameter.
25059 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
25060 DAG.getConstant(0, dl, MVT::i32));
25061 }
25062 case llvm::Intrinsic::x86_wrpkru: {
25063 SDLoc dl(Op);
25064 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
25065 // to the EDX and ECX parameters.
25066 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
25067 Op.getOperand(0), Op.getOperand(2),
25068 DAG.getConstant(0, dl, MVT::i32),
25069 DAG.getConstant(0, dl, MVT::i32));
25070 }
25071 case llvm::Intrinsic::x86_flags_read_u32:
25072 case llvm::Intrinsic::x86_flags_read_u64:
25073 case llvm::Intrinsic::x86_flags_write_u32:
25074 case llvm::Intrinsic::x86_flags_write_u64: {
25075 // We need a frame pointer because this will get lowered to a PUSH/POP
25076 // sequence.
25077 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
25078 MFI.setHasCopyImplyingStackAdjustment(true);
25079      // Don't do anything here; we will expand these intrinsics out later
25080      // during FinalizeISel in EmitInstrWithCustomInserter.
25081 return Op;
25082 }
25083 case Intrinsic::x86_lwpins32:
25084 case Intrinsic::x86_lwpins64:
25085 case Intrinsic::x86_umwait:
25086 case Intrinsic::x86_tpause: {
25087 SDLoc dl(Op);
25088 SDValue Chain = Op->getOperand(0);
25089 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25090 unsigned Opcode;
25091
25092 switch (IntNo) {
25093 default: llvm_unreachable("Impossible intrinsic")::llvm::llvm_unreachable_internal("Impossible intrinsic", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25093)
;
25094 case Intrinsic::x86_umwait:
25095 Opcode = X86ISD::UMWAIT;
25096 break;
25097 case Intrinsic::x86_tpause:
25098 Opcode = X86ISD::TPAUSE;
25099 break;
25100 case Intrinsic::x86_lwpins32:
25101 case Intrinsic::x86_lwpins64:
25102 Opcode = X86ISD::LWPINS;
25103 break;
25104 }
25105
25106 SDValue Operation =
25107 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
25108 Op->getOperand(3), Op->getOperand(4));
25109 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
25110 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
25111 Operation.getValue(1));
25112 }
25113 case Intrinsic::x86_enqcmd:
25114 case Intrinsic::x86_enqcmds: {
25115 SDLoc dl(Op);
25116 SDValue Chain = Op.getOperand(0);
25117 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
25118 unsigned Opcode;
25119 switch (IntNo) {
25120 default: llvm_unreachable("Impossible intrinsic!")::llvm::llvm_unreachable_internal("Impossible intrinsic!", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 25120)
;
25121 case Intrinsic::x86_enqcmd:
25122 Opcode = X86ISD::ENQCMD;
25123 break;
25124 case Intrinsic::x86_enqcmds:
25125 Opcode = X86ISD::ENQCMDS;
25126 break;
25127 }
25128 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
25129 Op.getOperand(3));
25130 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
25131 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
25132 Operation.getValue(1));
25133 }
25134 }
25135 return SDValue();
25136 }
25137
25138 SDLoc dl(Op);
25139 switch(IntrData->Type) {
25140 default: llvm_unreachable("Unknown Intrinsic Type");
25141 case RDSEED:
25142 case RDRAND: {
25143 // Emit the node with the right value type.
25144 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
25145 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
25146
25147 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
25148 // Otherwise return the value from Rand, which is always 0, cast to i32.
25149 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
25150 DAG.getConstant(1, dl, Op->getValueType(1)),
25151 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
25152 SDValue(Result.getNode(), 1)};
25153 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
25154
25155 // Return { result, isValid, chain }.
25156 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
25157 SDValue(Result.getNode(), 2));
25158 }
25159 case GATHER_AVX2: {
25160 SDValue Chain = Op.getOperand(0);
25161 SDValue Src = Op.getOperand(2);
25162 SDValue Base = Op.getOperand(3);
25163 SDValue Index = Op.getOperand(4);
25164 SDValue Mask = Op.getOperand(5);
25165 SDValue Scale = Op.getOperand(6);
25166 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
25167 Scale, Chain, Subtarget);
25168 }
25169 case GATHER: {
25170 // gather(v1, mask, index, base, scale);
25171 SDValue Chain = Op.getOperand(0);
25172 SDValue Src = Op.getOperand(2);
25173 SDValue Base = Op.getOperand(3);
25174 SDValue Index = Op.getOperand(4);
25175 SDValue Mask = Op.getOperand(5);
25176 SDValue Scale = Op.getOperand(6);
25177 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
25178 Chain, Subtarget);
25179 }
25180 case SCATTER: {
25181 // scatter(base, mask, index, v1, scale);
25182 SDValue Chain = Op.getOperand(0);
25183 SDValue Base = Op.getOperand(2);
25184 SDValue Mask = Op.getOperand(3);
25185 SDValue Index = Op.getOperand(4);
25186 SDValue Src = Op.getOperand(5);
25187 SDValue Scale = Op.getOperand(6);
25188 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
25189 Scale, Chain, Subtarget);
25190 }
25191 case PREFETCH: {
25192 const APInt &HintVal = Op.getConstantOperandAPInt(6);
25193 assert((HintVal == 2 || HintVal == 3) &&
25194 "Wrong prefetch hint in intrinsic: should be 2 or 3");
25195 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
25196 SDValue Chain = Op.getOperand(0);
25197 SDValue Mask = Op.getOperand(2);
25198 SDValue Index = Op.getOperand(3);
25199 SDValue Base = Op.getOperand(4);
25200 SDValue Scale = Op.getOperand(5);
25201 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
25202 Subtarget);
25203 }
25204 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
25205 case RDTSC: {
25206 SmallVector<SDValue, 2> Results;
25207 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
25208 Results);
25209 return DAG.getMergeValues(Results, dl);
25210 }
25211 // Read Performance Monitoring Counters.
25212 case RDPMC:
25213 // Get Extended Control Register.
25214 case XGETBV: {
25215 SmallVector<SDValue, 2> Results;
25216
25217 // RDPMC uses ECX to select the index of the performance counter to read.
25218 // XGETBV uses ECX to select the index of the XCR register to return.
25219 // The result is stored into registers EDX:EAX.
25220 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
25221 Subtarget, Results);
25222 return DAG.getMergeValues(Results, dl);
25223 }
25224 // XTEST intrinsics.
25225 case XTEST: {
25226 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
25227 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
25228
25229 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
25230 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
25231 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
25232 Ret, SDValue(InTrans.getNode(), 1));
25233 }
25234 case TRUNCATE_TO_MEM_VI8:
25235 case TRUNCATE_TO_MEM_VI16:
25236 case TRUNCATE_TO_MEM_VI32: {
25237 SDValue Mask = Op.getOperand(4);
25238 SDValue DataToTruncate = Op.getOperand(3);
25239 SDValue Addr = Op.getOperand(2);
25240 SDValue Chain = Op.getOperand(0);
25241
25242 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
25243 assert(MemIntr && "Expected MemIntrinsicSDNode!");
25244
25245 EVT MemVT = MemIntr->getMemoryVT();
25246
25247 uint16_t TruncationOp = IntrData->Opc0;
25248 switch (TruncationOp) {
25249 case X86ISD::VTRUNC: {
25250 if (isAllOnesConstant(Mask)) // return just a truncate store
25251 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
25252 MemIntr->getMemOperand());
25253
25254 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
25255 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25256 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
25257
25258 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
25259 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
25260 true /* truncating */);
25261 }
25262 case X86ISD::VTRUNCUS:
25263 case X86ISD::VTRUNCS: {
25264 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
25265 if (isAllOnesConstant(Mask))
25266 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
25267 MemIntr->getMemOperand(), DAG);
25268
25269 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
25270 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25271
25272 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
25273 VMask, MemVT, MemIntr->getMemOperand(), DAG);
25274 }
25275 default:
25276 llvm_unreachable("Unsupported truncstore intrinsic");
25277 }
25278 }
25279 }
25280}
25281
25282SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
25283 SelectionDAG &DAG) const {
25284 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
25285 MFI.setReturnAddressIsTaken(true);
25286
25287 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
25288 return SDValue();
25289
25290 unsigned Depth = Op.getConstantOperandVal(0);
25291 SDLoc dl(Op);
25292 EVT PtrVT = getPointerTy(DAG.getDataLayout());
25293
25294 if (Depth > 0) {
25295 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
25296 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25297 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
25298 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
25299 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
25300 MachinePointerInfo());
25301 }
25302
25303 // Just load the return address.
25304 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
25305 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
25306 MachinePointerInfo());
25307}
25308
25309SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
25310 SelectionDAG &DAG) const {
25311 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
25312 return getReturnAddressFrameIndex(DAG);
25313}
25314
25315SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
25316 MachineFunction &MF = DAG.getMachineFunction();
25317 MachineFrameInfo &MFI = MF.getFrameInfo();
25318 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25319 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25320 EVT VT = Op.getValueType();
25321
25322 MFI.setFrameAddressIsTaken(true);
25323
25324 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
25325 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
25326 // is not possible to crawl up the stack without looking at the unwind codes
25327 // simultaneously.
25328 int FrameAddrIndex = FuncInfo->getFAIndex();
25329 if (!FrameAddrIndex) {
25330 // Set up a frame object for the return address.
25331 unsigned SlotSize = RegInfo->getSlotSize();
25332 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
25333 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
25334 FuncInfo->setFAIndex(FrameAddrIndex);
25335 }
25336 return DAG.getFrameIndex(FrameAddrIndex, VT);
25337 }
25338
25339 unsigned FrameReg =
25340 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
25341 SDLoc dl(Op); // FIXME probably not meaningful
25342 unsigned Depth = Op.getConstantOperandVal(0);
25343 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
25344 (FrameReg == X86::EBP && VT == MVT::i32)) &&
25345 "Invalid Frame Register!");
25346 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
25347 while (Depth--)
25348 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
25349 MachinePointerInfo());
25350 return FrameAddr;
25351}
25352
25353// FIXME? Maybe this could be a TableGen attribute on some registers and
25354// this table could be generated automatically from RegInfo.
25355Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
25356 const MachineFunction &MF) const {
25357 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25358
25359 Register Reg = StringSwitch<unsigned>(RegName)
25360 .Case("esp", X86::ESP)
25361 .Case("rsp", X86::RSP)
25362 .Case("ebp", X86::EBP)
25363 .Case("rbp", X86::RBP)
25364 .Default(0);
25365
25366 if (Reg == X86::EBP || Reg == X86::RBP) {
25367 if (!TFI.hasFP(MF))
25368 report_fatal_error("register " + StringRef(RegName) +
25369 " is allocatable: function has no frame pointer");
25370#ifndef NDEBUG
25371 else {
25372 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25373 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
25374 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
25375 "Invalid Frame Register!");
25376 }
25377#endif
25378 }
25379
25380 if (Reg)
25381 return Reg;
25382
25383 report_fatal_error("Invalid register name global variable");
25384}
25385
25386SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
25387 SelectionDAG &DAG) const {
25388 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25389 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
25390}
25391
25392unsigned X86TargetLowering::getExceptionPointerRegister(
25393 const Constant *PersonalityFn) const {
25394 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
25395 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
25396
25397 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
25398}
25399
25400unsigned X86TargetLowering::getExceptionSelectorRegister(
25401 const Constant *PersonalityFn) const {
25402 // Funclet personalities don't use selectors (the runtime does the selection).
25403 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
25404 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
25405}
25406
25407bool X86TargetLowering::needsFixedCatchObjects() const {
25408 return Subtarget.isTargetWin64();
25409}
25410
25411SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
25412 SDValue Chain = Op.getOperand(0);
25413 SDValue Offset = Op.getOperand(1);
25414 SDValue Handler = Op.getOperand(2);
25415 SDLoc dl (Op);
25416
25417 EVT PtrVT = getPointerTy(DAG.getDataLayout());
25418 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25419 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
25420 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
25421 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
25422 "Invalid Frame Register!");
25423 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
25424 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
25425
25426 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
25427 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
25428 dl));
25429 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
25430 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
25431 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
25432
25433 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
25434 DAG.getRegister(StoreAddrReg, PtrVT));
25435}
25436
25437SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
25438 SelectionDAG &DAG) const {
25439 SDLoc DL(Op);
25440 // If the subtarget is not 64-bit, we may need the global base reg
25441 // after isel expands the pseudo, i.e., after the CGBR pass has run.
25442 // Therefore, ask for the GlobalBaseReg now, so that the pass
25443 // inserts the code for us in case we need it.
25444 // Otherwise, we will end up in a situation where we will
25445 // reference a virtual register that is not defined!
25446 if (!Subtarget.is64Bit()) {
25447 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25448 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
25449 }
25450 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
25451 DAG.getVTList(MVT::i32, MVT::Other),
25452 Op.getOperand(0), Op.getOperand(1));
25453}
25454
25455SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
25456 SelectionDAG &DAG) const {
25457 SDLoc DL(Op);
25458 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
25459 Op.getOperand(0), Op.getOperand(1));
25460}
25461
25462SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
25463 SelectionDAG &DAG) const {
25464 SDLoc DL(Op);
25465 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
25466 Op.getOperand(0));
25467}
25468
25469static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
25470 return Op.getOperand(0);
25471}
25472
25473SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
25474 SelectionDAG &DAG) const {
25475 SDValue Root = Op.getOperand(0);
25476 SDValue Trmp = Op.getOperand(1); // trampoline
25477 SDValue FPtr = Op.getOperand(2); // nested function
25478 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
25479 SDLoc dl (Op);
25480
25481 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25482 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25483
25484 if (Subtarget.is64Bit()) {
25485 SDValue OutChains[6];
25486
25487 // Large code-model.
25488 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
25489 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
25490
25491 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
25492 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
25493
25494 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
25495
25496 // Load the pointer to the nested function into R11.
25497 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
25498 SDValue Addr = Trmp;
25499 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
25500 Addr, MachinePointerInfo(TrmpAddr));
25501
25502 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25503 DAG.getConstant(2, dl, MVT::i64));
25504 OutChains[1] =
25505 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
25506 /* Alignment = */ 2);
25507
25508 // Load the 'nest' parameter value into R10.
25509 // R10 is specified in X86CallingConv.td
25510 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
25511 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25512 DAG.getConstant(10, dl, MVT::i64));
25513 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
25514 Addr, MachinePointerInfo(TrmpAddr, 10));
25515
25516 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25517 DAG.getConstant(12, dl, MVT::i64));
25518 OutChains[3] =
25519 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
25520 /* Alignment = */ 2);
25521
25522 // Jump to the nested function.
25523 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
25524 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25525 DAG.getConstant(20, dl, MVT::i64));
25526 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
25527 Addr, MachinePointerInfo(TrmpAddr, 20));
25528
25529 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
25530 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
25531 DAG.getConstant(22, dl, MVT::i64));
25532 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
25533 Addr, MachinePointerInfo(TrmpAddr, 22));
25534
25535 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
25536 } else {
25537 const Function *Func =
25538 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
25539 CallingConv::ID CC = Func->getCallingConv();
25540 unsigned NestReg;
25541
25542 switch (CC) {
25543 default:
25544 llvm_unreachable("Unsupported calling convention");
25545 case CallingConv::C:
25546 case CallingConv::X86_StdCall: {
25547 // Pass 'nest' parameter in ECX.
25548 // Must be kept in sync with X86CallingConv.td
25549 NestReg = X86::ECX;
25550
25551 // Check that ECX wasn't needed by an 'inreg' parameter.
25552 FunctionType *FTy = Func->getFunctionType();
25553 const AttributeList &Attrs = Func->getAttributes();
25554
25555 if (!Attrs.isEmpty() && !Func->isVarArg()) {
25556 unsigned InRegCount = 0;
25557 unsigned Idx = 1;
25558
25559 for (FunctionType::param_iterator I = FTy->param_begin(),
25560 E = FTy->param_end(); I != E; ++I, ++Idx)
25561 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
25562 auto &DL = DAG.getDataLayout();
25563 // FIXME: should only count parameters that are lowered to integers.
25564 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
25565 }
25566
25567 if (InRegCount > 2) {
25568 report_fatal_error("Nest register in use - reduce number of inreg"
25569 " parameters!");
25570 }
25571 }
25572 break;
25573 }
25574 case CallingConv::X86_FastCall:
25575 case CallingConv::X86_ThisCall:
25576 case CallingConv::Fast:
25577 case CallingConv::Tail:
25578 // Pass 'nest' parameter in EAX.
25579 // Must be kept in sync with X86CallingConv.td
25580 NestReg = X86::EAX;
25581 break;
25582 }
25583
25584 SDValue OutChains[4];
25585 SDValue Addr, Disp;
25586
25587 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25588 DAG.getConstant(10, dl, MVT::i32));
25589 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
25590
25591 // This is storing the opcode for MOV32ri.
25592 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
25593 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
25594 OutChains[0] =
25595 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
25596 Trmp, MachinePointerInfo(TrmpAddr));
25597
25598 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25599 DAG.getConstant(1, dl, MVT::i32));
25600 OutChains[1] =
25601 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
25602 /* Alignment = */ 1);
25603
25604 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
25605 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25606 DAG.getConstant(5, dl, MVT::i32));
25607 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
25608 Addr, MachinePointerInfo(TrmpAddr, 5),
25609 /* Alignment = */ 1);
25610
25611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
25612 DAG.getConstant(6, dl, MVT::i32));
25613 OutChains[3] =
25614 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
25615 /* Alignment = */ 1);
25616
25617 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
25618 }
25619}
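For reference, a standalone sketch (not part of X86ISelLowering.cpp) of the byte images the two paths above emit; buildTrampoline64/buildTrampoline32 and their parameters are illustrative names, and the memcpy of the immediates assumes a little-endian host, which is always true for x86.

#include <cstdint>
#include <cstring>

// 64-bit, 23 bytes: movabsq $fptr, %r11 ; movabsq $nest, %r10 ; jmpq *%r11
inline void buildTrampoline64(uint8_t *T, uint64_t Fptr, uint64_t Nest) {
  T[0] = 0x49; T[1] = 0xBB;                  // REX.WB + (MOV64ri | r11)
  std::memcpy(T + 2, &Fptr, 8);
  T[10] = 0x49; T[11] = 0xBA;                // REX.WB + (MOV64ri | r10)
  std::memcpy(T + 12, &Nest, 8);
  T[20] = 0x49; T[21] = 0xFF; T[22] = 0xE3;  // jmpq *%r11 (ModRM 0xE3)
}

// 32-bit, 10 bytes: movl $nest, %ecx/%eax ; jmp <fptr> (PC-relative)
inline void buildTrampoline32(uint8_t *T, uint32_t Trmp, uint32_t Fptr,
                              uint32_t Nest,
                              uint8_t NestRegEnc /* 1 = ECX, 0 = EAX */) {
  T[0] = 0xB8 | NestRegEnc;                  // MOV32ri | reg
  std::memcpy(T + 1, &Nest, 4);
  T[5] = 0xE9;                               // jmp rel32
  uint32_t Disp = Fptr - (Trmp + 10);        // Disp = FPtr - (Trmp + 10)
  std::memcpy(T + 6, &Disp, 4);
}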
25620
25621SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
25622 SelectionDAG &DAG) const {
25623 /*
25624 The rounding mode is in bits 11:10 of FPSR, and has the following
25625 settings:
25626 00 Round to nearest
25627 01 Round to -inf
25628 10 Round to +inf
25629 11 Round to 0
25630
25631 FLT_ROUNDS, on the other hand, expects the following:
25632 -1 Undefined
25633 0 Round to 0
25634 1 Round to nearest
25635 2 Round to +inf
25636 3 Round to -inf
25637
25638 To perform the conversion, we use a packed lookup table of the four 2-bit
25639 values that we can index by FPSR[11:10]
25640 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
25641
25642 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
25643 */
25644
25645 MachineFunction &MF = DAG.getMachineFunction();
25646 MVT VT = Op.getSimpleValueType();
25647 SDLoc DL(Op);
25648
25649 // Save FP Control Word to stack slot
25650 int SSFI =
25651 MF.getFrameInfo().CreateStackObject(2, 2, false);
25652 SDValue StackSlot =
25653 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
25654
25655 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
25656
25657 SDValue Chain = Op.getOperand(0);
25658 SDValue Ops[] = {Chain, StackSlot};
25659 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
25660 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
25661 2 /*Align*/, MachineMemOperand::MOStore);
25662
25663 // Load FP Control Word from stack slot
25664 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, 2 /*Align*/);
25665 Chain = CWD.getValue(1);
25666
25667 // Mask and turn the control bits into a shift for the lookup table.
25668 SDValue Shift =
25669 DAG.getNode(ISD::SRL, DL, MVT::i16,
25670 DAG.getNode(ISD::AND, DL, MVT::i16,
25671 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
25672 DAG.getConstant(9, DL, MVT::i8));
25673 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
25674
25675 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
25676 SDValue RetVal =
25677 DAG.getNode(ISD::AND, DL, MVT::i32,
25678 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
25679 DAG.getConstant(3, DL, MVT::i32));
25680
25681 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
25682
25683 return DAG.getMergeValues({RetVal, Chain}, DL);
25684}
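A minimal standalone check (not part of X86ISelLowering.cpp) of the table lookup described in the comment at the top of this function; fltRoundsFromFPSR is an illustrative name.

// Each 2-bit field of 0x2d holds the FLT_ROUNDS value for one FPSR setting.
constexpr unsigned fltRoundsFromFPSR(unsigned FPSR) {
  return (0x2d >> ((FPSR & 0xc00) >> 9)) & 3;
}
static_assert(fltRoundsFromFPSR(0x000) == 1, "00: round to nearest -> 1");
static_assert(fltRoundsFromFPSR(0x400) == 3, "01: round to -inf    -> 3");
static_assert(fltRoundsFromFPSR(0x800) == 2, "10: round to +inf    -> 2");
static_assert(fltRoundsFromFPSR(0xc00) == 0, "11: round to zero    -> 0");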
25685
25686 // Split a unary integer op into two half-sized ops.
25687static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
25688 MVT VT = Op.getSimpleValueType();
25689 unsigned NumElems = VT.getVectorNumElements();
25690 unsigned SizeInBits = VT.getSizeInBits();
25691 MVT EltVT = VT.getVectorElementType();
25692 SDValue Src = Op.getOperand(0);
25693 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
25694 "Src and Op should have the same element type!");
25695
25696 // Extract the Lo/Hi vectors
25697 SDLoc dl(Op);
25698 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
25699 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
25700
25701 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
25702 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25703 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
25704 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
25705}
25706
25707// Decompose 256-bit ops into smaller 128-bit ops.
25708static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
25709 assert(Op.getSimpleValueType().is256BitVector() &&
25710 Op.getSimpleValueType().isInteger() &&
25711 "Only handle AVX 256-bit vector integer operation");
25712 return LowerVectorIntUnary(Op, DAG);
25713}
25714
25715// Decompose 512-bit ops into smaller 256-bit ops.
25716static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
25717 assert(Op.getSimpleValueType().is512BitVector() &&
25718 Op.getSimpleValueType().isInteger() &&
25719 "Only handle AVX 512-bit vector integer operation");
25720 return LowerVectorIntUnary(Op, DAG);
25721}
25722
25723 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
25724 //
25725 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
25726 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
25727 // split the vector, perform the operation on its Lo and Hi parts and
25728 // concatenate the results.
25729static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
25730 const X86Subtarget &Subtarget) {
25731 assert(Op.getOpcode() == ISD::CTLZ);
25732 SDLoc dl(Op);
25733 MVT VT = Op.getSimpleValueType();
25734 MVT EltVT = VT.getVectorElementType();
25735 unsigned NumElems = VT.getVectorNumElements();
25736
25737 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
25738 "Unsupported element type");
25739
25740 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
25741 if (NumElems > 16 ||
25742 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
25743 return LowerVectorIntUnary(Op, DAG);
25744
25745 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
25746 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
25747 "Unsupported value type for operation");
25748
25749 // Use native supported vector instruction vplzcntd.
25750 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
25751 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
25752 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
25753 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
25754
25755 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
25756}
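A scalar model (not part of X86ISelLowering.cpp) of the sub(trunc(lzcnt(zext32(x)))) pattern described above, shown for one i8 element; ctlz8_via_ctlz32 is an illustrative name and the GCC/Clang builtin __builtin_clz is assumed, which requires a nonzero input.

#include <cstdint>

inline unsigned ctlz8_via_ctlz32(uint8_t X) {
  // Widen to 32 bits, count leading zeros there, then subtract the 24 zero
  // bits introduced by the zero-extension (32 - 8 = 24). Assumes X != 0.
  return __builtin_clz((uint32_t)X) - 24;
}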
25757
25758// Lower CTLZ using a PSHUFB lookup table implementation.
25759static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
25760 const X86Subtarget &Subtarget,
25761 SelectionDAG &DAG) {
25762 MVT VT = Op.getSimpleValueType();
25763 int NumElts = VT.getVectorNumElements();
25764 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
25765 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
25766
25767 // Per-nibble leading zero PSHUFB lookup table.
25768 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
25769 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
25770 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
25771 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
25772
25773 SmallVector<SDValue, 64> LUTVec;
25774 for (int i = 0; i < NumBytes; ++i)
25775 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
25776 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
25777
25778 // Begin by bitcasting the input to a byte vector, then split those bytes
25779 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
25780 // If the hi input nibble is zero then we add both results together, otherwise
25781 // we just take the hi result (by masking the lo result to zero before the
25782 // add).
25783 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
25784 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
25785
25786 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
25787 SDValue Lo = Op0;
25788 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
25789 SDValue HiZ;
25790 if (CurrVT.is512BitVector()) {
25791 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
25792 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
25793 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
25794 } else {
25795 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
25796 }
25797
25798 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
25799 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
25800 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
25801 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
25802
25803 // Merge result back from vXi8 back to VT, working on the lo/hi halves
25804 // of the current vector width in the same way we did for the nibbles.
25805 // If the upper half of the input element is zero then add the halves'
25806 // leading zero counts together, otherwise just use the upper half's.
25807 // Double the width of the result until we are at target width.
25808 while (CurrVT != VT) {
25809 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
25810 int CurrNumElts = CurrVT.getVectorNumElements();
25811 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
25812 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
25813 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
25814
25815 // Check if the upper half of the input element is zero.
25816 if (CurrVT.is512BitVector()) {
25817 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
25818 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
25819 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
25820 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
25821 } else {
25822 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
25823 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
25824 }
25825 HiZ = DAG.getBitcast(NextVT, HiZ);
25826
25827 // Move the upper/lower halves to the lower bits as we'll be extending to
25828 // NextVT. Mask the lower result to zero if HiZ is true and add the results
25829 // together.
25830 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
25831 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
25832 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
25833 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
25834 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
25835 CurrVT = NextVT;
25836 }
25837
25838 return Res;
25839}
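A scalar model (not part of X86ISelLowering.cpp) of the per-nibble lookup described above, for a single byte; ctlz8_via_nibble_lut is an illustrative name.

#include <cstdint>

inline unsigned ctlz8_via_nibble_lut(uint8_t X) {
  // Leading-zero count of each nibble value, matching the PSHUFB table above.
  static const unsigned LUT[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                   0, 0, 0, 0, 0, 0, 0, 0};
  unsigned Hi = X >> 4, Lo = X & 0xF;
  // If the hi nibble is zero, add both counts; otherwise only the hi count.
  return (Hi == 0) ? LUT[Hi] + LUT[Lo] : LUT[Hi];
}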
25840
25841static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
25842 const X86Subtarget &Subtarget,
25843 SelectionDAG &DAG) {
25844 MVT VT = Op.getSimpleValueType();
25845
25846 if (Subtarget.hasCDI() &&
25847 // vXi8 vectors need to be promoted to 512-bits for vXi32.
25848 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
25849 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
25850
25851 // Decompose 256-bit ops into smaller 128-bit ops.
25852 if (VT.is256BitVector() && !Subtarget.hasInt256())
25853 return Lower256IntUnary(Op, DAG);
25854
25855 // Decompose 512-bit ops into smaller 256-bit ops.
25856 if (VT.is512BitVector() && !Subtarget.hasBWI())
25857 return Lower512IntUnary(Op, DAG);
25858
25859 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
25860 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
25861}
25862
25863static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
25864 SelectionDAG &DAG) {
25865 MVT VT = Op.getSimpleValueType();
25866 MVT OpVT = VT;
25867 unsigned NumBits = VT.getSizeInBits();
25868 SDLoc dl(Op);
25869 unsigned Opc = Op.getOpcode();
25870
25871 if (VT.isVector())
25872 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
25873
25874 Op = Op.getOperand(0);
25875 if (VT == MVT::i8) {
25876 // Zero extend to i32 since there is no i8 bsr.
25877 OpVT = MVT::i32;
25878 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
25879 }
25880
25881 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
25882 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
25883 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
25884
25885 if (Opc == ISD::CTLZ) {
25886 // If src is zero (i.e. bsr sets ZF), returns NumBits.
25887 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
25888 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
25889 Op.getValue(1)};
25890 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
25891 }
25892
25893 // Finally xor with NumBits-1.
25894 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
25895 DAG.getConstant(NumBits - 1, dl, OpVT));
25896
25897 if (VT == MVT::i8)
25898 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
25899 return Op;
25900}
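A scalar model (not part of X86ISelLowering.cpp) of the i32 lowering above; ctlz32_via_bsr is an illustrative name.

#include <cstdint>

inline unsigned ctlz32_via_bsr(uint32_t X) {
  unsigned BSR = 0;
  for (unsigned i = 0; i < 32; ++i)
    if (X & (1u << i))
      BSR = i;                          // BSR: index of the highest set bit
  // The CMOV on ZF substitutes NumBits + NumBits - 1 when the input is zero,
  // so the final XOR with NumBits - 1 yields NumBits (63 ^ 31 == 32).
  unsigned V = (X == 0) ? (32 + 32 - 1) : BSR;
  return V ^ 31;
}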
25901
25902static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
25903 SelectionDAG &DAG) {
25904 MVT VT = Op.getSimpleValueType();
25905 unsigned NumBits = VT.getScalarSizeInBits();
25906 SDValue N0 = Op.getOperand(0);
25907 SDLoc dl(Op);
25908
25909 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
25910 "Only scalar CTTZ requires custom lowering");
25911
25912 // Issue a bsf (scan bits forward) which also sets EFLAGS.
25913 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
25914 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
25915
25916 // If src is zero (i.e. bsf sets ZF), returns NumBits.
25917 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
25918 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
25919 Op.getValue(1)};
25920 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
25921}
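The same kind of scalar model (not part of X86ISelLowering.cpp) for the CTTZ lowering above; cttz32_via_bsf is an illustrative name.

#include <cstdint>

inline unsigned cttz32_via_bsf(uint32_t X) {
  if (X == 0)
    return 32;                          // the CMOV on ZF returns NumBits
  unsigned Idx = 0;
  while (!(X & 1u)) {                   // BSF: index of the lowest set bit
    X >>= 1;
    ++Idx;
  }
  return Idx;
}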
25922
25923/// Break a 256-bit integer operation into two new 128-bit ones and then
25924/// concatenate the result back.
25925static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
25926 MVT VT = Op.getSimpleValueType();
25927
25928 assert(VT.is256BitVector() && VT.isInteger() &&
25929 "Unsupported value type for operation");
25930
25931 unsigned NumElems = VT.getVectorNumElements();
25932 SDLoc dl(Op);
25933
25934 // Extract the LHS vectors
25935 SDValue LHS = Op.getOperand(0);
25936 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
25937 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
25938
25939 // Extract the RHS vectors
25940 SDValue RHS = Op.getOperand(1);
25941 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
25942 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
25943
25944 MVT EltVT = VT.getVectorElementType();
25945 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
25946
25947 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25948 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
25949 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
25950}
25951
25952/// Break a 512-bit integer operation into two new 256-bit ones and then
25953/// concatenate the result back.
25954static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
25955 MVT VT = Op.getSimpleValueType();
25956
25957 assert(VT.is512BitVector() && VT.isInteger() &&
25958 "Unsupported value type for operation");
25959
25960 unsigned NumElems = VT.getVectorNumElements();
25961 SDLoc dl(Op);
25962
25963 // Extract the LHS vectors
25964 SDValue LHS = Op.getOperand(0);
25965 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
25966 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
25967
25968 // Extract the RHS vectors
25969 SDValue RHS = Op.getOperand(1);
25970 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
25971 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
25972
25973 MVT EltVT = VT.getVectorElementType();
25974 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
25975
25976 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25977 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
25978 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
25979}
25980
25981static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
25982 const X86Subtarget &Subtarget) {
25983 MVT VT = Op.getSimpleValueType();
25984 if (VT == MVT::i16 || VT == MVT::i32)
25985 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
25986
25987 if (VT.getScalarType() == MVT::i1)
25988 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
25989 Op.getOperand(0), Op.getOperand(1));
25990
25991 assert(Op.getSimpleValueType().is256BitVector() &&
25992 Op.getSimpleValueType().isInteger() &&
25993 "Only handle AVX 256-bit vector integer operation");
25994 return split256IntArith(Op, DAG);
25995}
25996
25997static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
25998 const X86Subtarget &Subtarget) {
25999 MVT VT = Op.getSimpleValueType();
26000 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
26001 unsigned Opcode = Op.getOpcode();
26002 if (VT.getScalarType() == MVT::i1) {
26003 SDLoc dl(Op);
26004 switch (Opcode) {
26005 default: llvm_unreachable("Expected saturated arithmetic opcode");
26006 case ISD::UADDSAT:
26007 case ISD::SADDSAT:
26008 // *addsat i1 X, Y --> X | Y
26009 return DAG.getNode(ISD::OR, dl, VT, X, Y);
26010 case ISD::USUBSAT:
26011 case ISD::SSUBSAT:
26012 // *subsat i1 X, Y --> X & ~Y
26013 return DAG.getNode(ISD::AND, dl, VT, X, DAG.getNOT(dl, Y, VT));
26014 }
26015 }
26016
26017 if (VT.is128BitVector()) {
26018 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
26019 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26020 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
26021 *DAG.getContext(), VT);
26022 SDLoc DL(Op);
26023 if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
26024 // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
26025 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
26026 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
26027 return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
26028 }
26029 if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
26030 // usubsat X, Y --> (X >u Y) ? X - Y : 0
26031 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
26032 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
26033 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
26034 }
26035 // Use default expansion.
26036 return SDValue();
26037 }
26038
26039 assert(Op.getSimpleValueType().is256BitVector() &&
26040 Op.getSimpleValueType().isInteger() &&
26041 "Only handle AVX 256-bit vector integer operation");
26042 return split256IntArith(Op, DAG);
26043}
26044
26045static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
26046 SelectionDAG &DAG) {
26047 MVT VT = Op.getSimpleValueType();
26048 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
26049 // Since X86 does not have CMOV for 8-bit integer, we don't convert
26050 // 8-bit integer abs to NEG and CMOV.
26051 SDLoc DL(Op);
26052 SDValue N0 = Op.getOperand(0);
26053 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
26054 DAG.getConstant(0, DL, VT), N0);
26055 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_GE, DL, MVT::i8),
26056 SDValue(Neg.getNode(), 1)};
26057 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
26058 }
26059
26060 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
26061 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
26062 SDLoc DL(Op);
26063 SDValue Src = Op.getOperand(0);
26064 SDValue Sub =
26065 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
26066 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
26067 }
26068
26069 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
26070 assert(VT.isInteger() &&
26071 "Only handle AVX 256-bit vector integer operation");
26072 return Lower256IntUnary(Op, DAG);
26073 }
26074
26075 // Default to expand.
26076 return SDValue();
26077}
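A scalar model (not part of X86ISelLowering.cpp) of the vXi64 path above; abs64_via_blendv is an illustrative name. X86ISD::BLENDV conditions on the sign bit of its first operand (here X itself), so the blend picks 0-X for negative elements and X otherwise.

#include <cstdint>

inline int64_t abs64_via_blendv(int64_t X) {
  int64_t Sub = (int64_t)(0 - (uint64_t)X);  // the 0 - X operand
  return (X < 0) ? Sub : X;                  // sign bit of X selects Sub or X
}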
26078
26079static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
26080 MVT VT = Op.getSimpleValueType();
26081
26082 // For AVX1 cases, split to use legal ops (everything but v4i64).
26083 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
26084 return split256IntArith(Op, DAG);
26085
26086 SDLoc DL(Op);
26087 unsigned Opcode = Op.getOpcode();
26088 SDValue N0 = Op.getOperand(0);
26089 SDValue N1 = Op.getOperand(1);
26090
26091 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
26092 // using the SMIN/SMAX instructions and flipping the signbit back.
26093 if (VT == MVT::v8i16) {
26094 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
26095 "Unexpected MIN/MAX opcode");
26096 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
26097 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
26098 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
26099 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
26100 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
26101 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
26102 }
26103
26104 // Else, expand to a compare/select.
26105 ISD::CondCode CC;
26106 switch (Opcode) {
26107 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
26108 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
26109 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
26110 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
26111 default: llvm_unreachable("Unknown MINMAX opcode");
26112 }
26113
26114 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
26115 return DAG.getSelect(DL, VT, Cond, N0, N1);
26116}
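
For reference, a minimal scalar sketch (not part of the original source; the function name is illustrative) of the sign-bit flip used by LowerMINMAX above: flipping the sign bit maps unsigned order onto signed order, so an unsigned min can be done with a signed min.

#include <algorithm>
#include <cstdint>

uint16_t umin_via_smin(uint16_t A, uint16_t B) {
  uint16_t FA = A ^ 0x8000u;                      // XOR with the sign bit
  uint16_t FB = B ^ 0x8000u;
  int16_t R = std::min(static_cast<int16_t>(FA),  // signed min, as PMINSW would do
                       static_cast<int16_t>(FB));
  return static_cast<uint16_t>(R) ^ 0x8000u;      // flip the sign bit back
}
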
26117
26118static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
26119 SelectionDAG &DAG) {
26120 SDLoc dl(Op);
26121 MVT VT = Op.getSimpleValueType();
26122
26123 if (VT.getScalarType() == MVT::i1)
26124 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
26125
26126 // Decompose 256-bit ops into 128-bit ops.
26127 if (VT.is256BitVector() && !Subtarget.hasInt256())
26128 return split256IntArith(Op, DAG);
26129
26130 SDValue A = Op.getOperand(0);
26131 SDValue B = Op.getOperand(1);
26132
26133 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
26134 // vector pairs, multiply and truncate.
26135 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
26136 unsigned NumElts = VT.getVectorNumElements();
26137
26138 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
26139 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
26140 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
26141 return DAG.getNode(
26142 ISD::TRUNCATE, dl, VT,
26143 DAG.getNode(ISD::MUL, dl, ExVT,
26144 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
26145 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
26146 }
26147
26148 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26149
26150 // Extract the lo/hi parts and any-extend to i16.
26151 // We're only going to keep the low byte of each result element of the
26152 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
26153 // element.
26154 SDValue Undef = DAG.getUNDEF(VT);
26155 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
26156 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
26157
26158 SDValue BLo, BHi;
26159 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
26160 // If the RHS is a constant, manually unpackl/unpackh.
26161 SmallVector<SDValue, 16> LoOps, HiOps;
26162 for (unsigned i = 0; i != NumElts; i += 16) {
26163 for (unsigned j = 0; j != 8; ++j) {
26164 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
26165 MVT::i16));
26166 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
26167 MVT::i16));
26168 }
26169 }
26170
26171 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
26172 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
26173 } else {
26174 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
26175 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
26176 }
26177
26178 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
26179 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
26180 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
26181 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
26182 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
26183 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
26184 }
26185
26186 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
26187 if (VT == MVT::v4i32) {
26188 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
26189 "Should not custom lower when pmulld is available!");
26190
26191 // Extract the odd parts.
26192 static const int UnpackMask[] = { 1, -1, 3, -1 };
26193 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
26194 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
26195
26196 // Multiply the even parts.
26197 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
26198 DAG.getBitcast(MVT::v2i64, A),
26199 DAG.getBitcast(MVT::v2i64, B));
26200 // Now multiply odd parts.
26201 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
26202 DAG.getBitcast(MVT::v2i64, Aodds),
26203 DAG.getBitcast(MVT::v2i64, Bodds));
26204
26205 Evens = DAG.getBitcast(VT, Evens);
26206 Odds = DAG.getBitcast(VT, Odds);
26207
26208 // Merge the two vectors back together with a shuffle. This expands into 2
26209 // shuffles.
26210 static const int ShufMask[] = { 0, 4, 2, 6 };
26211 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
26212 }
26213
26214 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
26215 "Only know how to lower V2I64/V4I64/V8I64 multiply");
26216 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
26217
26218 // Ahi = psrlqi(a, 32);
26219 // Bhi = psrlqi(b, 32);
26220 //
26221 // AloBlo = pmuludq(a, b);
26222 // AloBhi = pmuludq(a, Bhi);
26223 // AhiBlo = pmuludq(Ahi, b);
26224 //
26225 // Hi = psllqi(AloBhi + AhiBlo, 32);
26226 // return AloBlo + Hi;
26227 KnownBits AKnown = DAG.computeKnownBits(A);
26228 KnownBits BKnown = DAG.computeKnownBits(B);
26229
26230 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
26231 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
26232 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
26233
26234 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
26235 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
26236 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
26237
26238 SDValue Zero = DAG.getConstant(0, dl, VT);
26239
26240 // Only multiply lo/hi halves that aren't known to be zero.
26241 SDValue AloBlo = Zero;
26242 if (!ALoIsZero && !BLoIsZero)
26243 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
26244
26245 SDValue AloBhi = Zero;
26246 if (!ALoIsZero && !BHiIsZero) {
26247 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
26248 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
26249 }
26250
26251 SDValue AhiBlo = Zero;
26252 if (!AHiIsZero && !BLoIsZero) {
26253 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
26254 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
26255 }
26256
26257 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
26258 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
26259
26260 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
26261}
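
For reference, a minimal scalar sketch (not part of the original source; the function name is illustrative) of the PMULUDQ decomposition sketched in the comments of LowerMUL above: with a split into 32-bit halves, the low 64 bits of the product only need three 32x32 multiplies, since the Ahi*Bhi term is shifted out.

#include <cstdint>

uint64_t mul64_from_32bit_halves(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t AloBlo = ALo * BLo;            // pmuludq(a, b)
  uint64_t AloBhi = ALo * BHi;            // pmuludq(a, b >> 32)
  uint64_t AhiBlo = AHi * BLo;            // pmuludq(a >> 32, b)
  uint64_t Hi = (AloBhi + AhiBlo) << 32;  // psllq(AloBhi + AhiBlo, 32)
  return AloBlo + Hi;                     // == A * B (mod 2^64)
}
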
26262
26263static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
26264 SelectionDAG &DAG) {
26265 SDLoc dl(Op);
26266 MVT VT = Op.getSimpleValueType();
26267 bool IsSigned = Op->getOpcode() == ISD::MULHS;
26268 unsigned NumElts = VT.getVectorNumElements();
26269 SDValue A = Op.getOperand(0);
26270 SDValue B = Op.getOperand(1);
26271
26272 // Decompose 256-bit ops into 128-bit ops.
26273 if (VT.is256BitVector() && !Subtarget.hasInt256())
26274 return split256IntArith(Op, DAG);
26275
26276 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
26277 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
26278 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
26279 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
26280
26281 // PMULxD operations multiply each even value (starting at 0) of LHS with
26282 // the related value of RHS and produce a widened result.
26283 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
26284 // => <2 x i64> <ae|cg>
26285 //
26286 // In other words, to have all the results, we need to perform two PMULxD:
26287 // 1. one with the even values.
26288 // 2. one with the odd values.
26289 // To achieve #2, we need to place the odd values at an even position.
26290 //
26291 // Place the odd value at an even position (basically, shift all values 1
26292 // step to the left):
26293 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
26294 9, -1, 11, -1, 13, -1, 15, -1};
26295 // <a|b|c|d> => <b|undef|d|undef>
26296 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A,
26297 makeArrayRef(&Mask[0], NumElts));
26298 // <e|f|g|h> => <f|undef|h|undef>
26299 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B,
26300 makeArrayRef(&Mask[0], NumElts));
26301
26302 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
26303 // ints.
26304 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
26305 unsigned Opcode =
26306 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
26307 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
26308 // => <2 x i64> <ae|cg>
26309 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
26310 DAG.getBitcast(MulVT, A),
26311 DAG.getBitcast(MulVT, B)));
26312 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
26313 // => <2 x i64> <bf|dh>
26314 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
26315 DAG.getBitcast(MulVT, Odd0),
26316 DAG.getBitcast(MulVT, Odd1)));
26317
26318 // Shuffle it back into the right order.
26319 SmallVector<int, 16> ShufMask(NumElts);
26320 for (int i = 0; i != (int)NumElts; ++i)
26321 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
26322
26323 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
26324
26325 // If we have a signed multiply but no PMULDQ fix up the result of an
26326 // unsigned multiply.
26327 if (IsSigned && !Subtarget.hasSSE41()) {
26328 SDValue Zero = DAG.getConstant(0, dl, VT);
26329 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
26330 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
26331 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
26332 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
26333
26334 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
26335 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
26336 }
26337
26338 return Res;
26339 }
26340
26341 // Only i8 vectors should need custom lowering after this.
26342 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
26343 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
26344 "Unsupported vector type");
26345
26346 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
26347 // logical shift down the upper half and pack back to i8.
26348
26349 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
26350 // and then ashr/lshr the upper bits down to the lower bits before multiply.
26351 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
26352
26353 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
26354 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
26355 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
26356 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
26357 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
26358 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
26359 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
26360 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
26361 }
26362
26363 // For signed 512-bit vectors, split into 256-bit vectors to allow the
26364 // sign-extension to occur.
26365 if (VT == MVT::v64i8 && IsSigned)
26366 return split512IntArith(Op, DAG);
26367
26368 // Signed AVX2 implementation - extend xmm subvectors to ymm.
26369 if (VT == MVT::v32i8 && IsSigned) {
26370 MVT ExVT = MVT::v16i16;
26371 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
26372 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
26373 SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
26374 SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
26375 ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
26376 BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
26377 AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
26378 BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
26379 SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
26380 SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
26381 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
26382 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
26383
26384 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
26385 // Shuffle lowering should turn this into PACKUS+PERMQ
26386 Lo = DAG.getBitcast(VT, Lo);
26387 Hi = DAG.getBitcast(VT, Hi);
26388 return DAG.getVectorShuffle(VT, dl, Lo, Hi,
26389 { 0, 2, 4, 6, 8, 10, 12, 14,
26390 16, 18, 20, 22, 24, 26, 28, 30,
26391 32, 34, 36, 38, 40, 42, 44, 46,
26392 48, 50, 52, 54, 56, 58, 60, 62});
26393 }
26394
26395 // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
26396 // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
26397 // shift the results and pack the half lane results back together.
26398
26399 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26400
26401 static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15,
26402 -1, -1, -1, -1, -1, -1, -1, -1};
26403
26404 // Extract the lo parts and zero/sign extend to i16.
26405 // Only use SSE4.1 instructions for signed v16i8 where using unpack requires
26406 // shifts to sign extend. Using unpack for unsigned only requires an xor to
26407 // create zeros and a copy due to tied register constraints pre-AVX. But using
26408 // zero_extend_vector_inreg would require an additional pshufd for the high
26409 // part.
26410
26411 SDValue ALo, AHi;
26412 if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
26413 ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A);
26414
26415 AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask);
26416 AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi);
26417 } else if (IsSigned) {
26418 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A));
26419 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A));
26420
26421 ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG);
26422 AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG);
26423 } else {
26424 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A,
26425 DAG.getConstant(0, dl, VT)));
26426 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A,
26427 DAG.getConstant(0, dl, VT)));
26428 }
26429
26430 SDValue BLo, BHi;
26431 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
26432 // If the RHS is a constant, manually unpackl/unpackh and extend.
26433 SmallVector<SDValue, 16> LoOps, HiOps;
26434 for (unsigned i = 0; i != NumElts; i += 16) {
26435 for (unsigned j = 0; j != 8; ++j) {
26436 SDValue LoOp = B.getOperand(i + j);
26437 SDValue HiOp = B.getOperand(i + j + 8);
26438
26439 if (IsSigned) {
26440 LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16);
26441 HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16);
26442 } else {
26443 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
26444 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
26445 }
26446
26447 LoOps.push_back(LoOp);
26448 HiOps.push_back(HiOp);
26449 }
26450 }
26451
26452 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
26453 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
26454 } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) {
26455 BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B);
26456
26457 BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask);
26458 BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi);
26459 } else if (IsSigned) {
26460 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B));
26461 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B));
26462
26463 BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG);
26464 BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG);
26465 } else {
26466 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B,
26467 DAG.getConstant(0, dl, VT)));
26468 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B,
26469 DAG.getConstant(0, dl, VT)));
26470 }
26471
26472 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
26473 // pack back to vXi8.
26474 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
26475 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
26476 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG);
26477 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG);
26478
26479 // Bitcast back to VT and then pack all the even elements from Lo and Hi.
26480 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
26481}
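
For reference, a minimal scalar sketch (not part of the original source; the function name is illustrative) of the signed fixup performed at the end of the vXi32 path in LowerMULH above: without PMULDQ the code computes an unsigned high half and corrects it with mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0).

#include <cstdint>

int32_t mulhs_via_mulhu(int32_t A, int32_t B) {
  uint32_t UA = static_cast<uint32_t>(A), UB = static_cast<uint32_t>(B);
  uint32_t HiU = static_cast<uint32_t>((static_cast<uint64_t>(UA) * UB) >> 32);
  uint32_t T1 = (A < 0) ? UB : 0u;   // (0 > A) mask ANDed with B in the DAG
  uint32_t T2 = (B < 0) ? UA : 0u;   // (0 > B) mask ANDed with A in the DAG
  return static_cast<int32_t>(HiU - T1 - T2);
}
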
26482
26483 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
26484 assert(Subtarget.isTargetWin64() && "Unexpected target");
26485 EVT VT = Op.getValueType();
26486 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
26487 "Unexpected return type for lowering");
26488
26489 RTLIB::Libcall LC;
26490 bool isSigned;
26491 switch (Op->getOpcode()) {
26492 default: llvm_unreachable("Unexpected request for libcall!");
26493 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
26494 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
26495 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
26496 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
26497 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
26498 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
26499 }
26500
26501 SDLoc dl(Op);
26502 SDValue InChain = DAG.getEntryNode();
26503
26504 TargetLowering::ArgListTy Args;
26505 TargetLowering::ArgListEntry Entry;
26506 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
26507 EVT ArgVT = Op->getOperand(i).getValueType();
26508 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
26509 "Unexpected argument type for lowering");
26510 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
26511 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
26512 MachinePointerInfo MPI =
26513 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
26514 Entry.Node = StackPtr;
26515 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
26516 MPI, /* Alignment = */ 16);
26517 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
26518 Entry.Ty = PointerType::get(ArgTy,0);
26519 Entry.IsSExt = false;
26520 Entry.IsZExt = false;
26521 Args.push_back(Entry);
26522 }
26523
26524 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
26525 getPointerTy(DAG.getDataLayout()));
26526
26527 TargetLowering::CallLoweringInfo CLI(DAG);
26528 CLI.setDebugLoc(dl)
26529 .setChain(InChain)
26530 .setLibCallee(
26531 getLibcallCallingConv(LC),
26532 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
26533 std::move(Args))
26534 .setInRegister()
26535 .setSExtResult(isSigned)
26536 .setZExtResult(!isSigned);
26537
26538 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
26539 return DAG.getBitcast(VT, CallInfo.first);
26540}
26541
26542// Return true if the required (according to Opcode) shift-imm form is natively
26543// supported by the Subtarget
26544static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
26545 unsigned Opcode) {
26546 if (VT.getScalarSizeInBits() < 16)
26547 return false;
26548
26549 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
26550 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
26551 return true;
26552
26553 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
26554 (VT.is256BitVector() && Subtarget.hasInt256());
26555
26556 bool AShift = LShift && (Subtarget.hasAVX512() ||
26557 (VT != MVT::v2i64 && VT != MVT::v4i64));
26558 return (Opcode == ISD::SRA) ? AShift : LShift;
26559}
26560
26561// The shift amount is a variable, but it is the same for all vector lanes.
26562// These instructions are defined together with shift-immediate.
26563static
26564bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
26565 unsigned Opcode) {
26566 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
26567}
26568
26569// Return true if the required (according to Opcode) variable-shift form is
26570// natively supported by the Subtarget
26571static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
26572 unsigned Opcode) {
26573
26574 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
26575 return false;
26576
26577 // vXi16 supported only on AVX-512, BWI
26578 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
26579 return false;
26580
26581 if (Subtarget.hasAVX512())
26582 return true;
26583
26584 bool LShift = VT.is128BitVector() || VT.is256BitVector();
26585 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
26586 return (Opcode == ISD::SRA) ? AShift : LShift;
26587}
26588
26589static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
26590 const X86Subtarget &Subtarget) {
26591 MVT VT = Op.getSimpleValueType();
26592 SDLoc dl(Op);
26593 SDValue R = Op.getOperand(0);
26594 SDValue Amt = Op.getOperand(1);
26595 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
26596
26597 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
26598 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
26599 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
26600 SDValue Ex = DAG.getBitcast(ExVT, R);
26601
26602 // ashr(R, 63) === cmp_slt(R, 0)
26603 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
26604 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
26605 "Unsupported PCMPGT op");
26606 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
26607 }
26608
26609 if (ShiftAmt >= 32) {
26610 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
26611 SDValue Upper =
26612 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
26613 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
26614 ShiftAmt - 32, DAG);
26615 if (VT == MVT::v2i64)
26616 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
26617 if (VT == MVT::v4i64)
26618 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
26619 {9, 1, 11, 3, 13, 5, 15, 7});
26620 } else {
26621 // SRA upper i32, SRL whole i64 and select lower i32.
26622 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
26623 ShiftAmt, DAG);
26624 SDValue Lower =
26625 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
26626 Lower = DAG.getBitcast(ExVT, Lower);
26627 if (VT == MVT::v2i64)
26628 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
26629 if (VT == MVT::v4i64)
26630 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
26631 {8, 1, 10, 3, 12, 5, 14, 7});
26632 }
26633 return DAG.getBitcast(VT, Ex);
26634 };
26635
26636 // Optimize shl/srl/sra with constant shift amount.
26637 APInt APIntShiftAmt;
26638 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
26639 return SDValue();
26640
26641 // If the shift amount is out of range, return undef.
26642 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
26643 return DAG.getUNDEF(VT);
26644
26645 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
26646
26647 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
26648 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
26649
26650 // i64 SRA needs to be performed as partial shifts.
26651 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
26652 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
26653 Op.getOpcode() == ISD::SRA)
26654 return ArithmeticShiftRight64(ShiftAmt);
26655
26656 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
26657 VT == MVT::v64i8) {
26658 unsigned NumElts = VT.getVectorNumElements();
26659 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26660
26661 // Simple i8 add case
26662 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
26663 return DAG.getNode(ISD::ADD, dl, VT, R, R);
26664
26665 // ashr(R, 7) === cmp_slt(R, 0)
26666 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
26667 SDValue Zeros = DAG.getConstant(0, dl, VT);
26668 if (VT.is512BitVector()) {
26669 assert(VT == MVT::v64i8 && "Unexpected element type!");
26670 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
26671 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
26672 }
26673 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
26674 }
26675
26676 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
26677 if (VT == MVT::v16i8 && Subtarget.hasXOP())
26678 return SDValue();
26679
26680 if (Op.getOpcode() == ISD::SHL) {
26681 // Make a large shift.
26682 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
26683 ShiftAmt, DAG);
26684 SHL = DAG.getBitcast(VT, SHL);
26685 // Zero out the rightmost bits.
26686 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
26687 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
26688 }
26689 if (Op.getOpcode() == ISD::SRL) {
26690 // Make a large shift.
26691 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
26692 ShiftAmt, DAG);
26693 SRL = DAG.getBitcast(VT, SRL);
26694 // Zero out the leftmost bits.
26695 return DAG.getNode(ISD::AND, dl, VT, SRL,
26696 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
26697 }
26698 if (Op.getOpcode() == ISD::SRA) {
26699 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
26700 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
26701
26702 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
26703 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
26704 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
26705 return Res;
26706 }
26707 llvm_unreachable("Unknown shift opcode.");
26708 }
26709
26710 return SDValue();
26711}
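
For reference, a minimal scalar sketch (not part of the original source; the function name is illustrative) of the vXi8 SRA identity used in LowerScalarImmediateShift above: with M = 0x80 >> Amt (where the sign bit lands after a logical shift), ashr(R, Amt) == (lshr(R, Amt) ^ M) - M.

#include <cstdint>

int8_t ashr_via_lshr(int8_t R, unsigned Amt) {      // Amt in [0, 7]
  uint8_t Shifted = static_cast<uint8_t>(R) >> Amt; // logical shift
  uint8_t M = 0x80u >> Amt;                         // relocated sign bit
  return static_cast<int8_t>(static_cast<uint8_t>((Shifted ^ M) - M));
}
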
26712
26713static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
26714 const X86Subtarget &Subtarget) {
26715 MVT VT = Op.getSimpleValueType();
26716 SDLoc dl(Op);
26717 SDValue R = Op.getOperand(0);
26718 SDValue Amt = Op.getOperand(1);
26719 unsigned Opcode = Op.getOpcode();
26720 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
26721 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true);
26722
26723 if (SDValue BaseShAmt = DAG.getSplatValue(Amt)) {
26724 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
26725 MVT EltVT = VT.getVectorElementType();
26726 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
26727 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
26728 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
26729 else if (EltVT.bitsLT(MVT::i32))
26730 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
26731
26732 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
26733 }
26734
26735 // vXi8 shifts - shift as v8i16 + mask result.
26736 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
26737 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
26738 VT == MVT::v64i8) &&
26739 !Subtarget.hasXOP()) {
26740 unsigned NumElts = VT.getVectorNumElements();
26741 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
26742 if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
26743 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
26744 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
26745 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
26746
26747 // Create the mask using vXi16 shifts. For shift-rights we need to move
26748 // the upper byte down before splatting the vXi8 mask.
26749 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
26750 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
26751 BaseShAmt, Subtarget, DAG);
26752 if (Opcode != ISD::SHL)
26753 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
26754 8, DAG);
26755 BitMask = DAG.getBitcast(VT, BitMask);
26756 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
26757 SmallVector<int, 64>(NumElts, 0));
26758
26759 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
26760 DAG.getBitcast(ExtVT, R), BaseShAmt,
26761 Subtarget, DAG);
26762 Res = DAG.getBitcast(VT, Res);
26763 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
26764
26765 if (Opcode == ISD::SRA) {
26766 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
26767 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
26768 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
26769 SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask,
26770 BaseShAmt, Subtarget, DAG);
26771 SignMask = DAG.getBitcast(VT, SignMask);
26772 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
26773 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
26774 }
26775 return Res;
26776 }
26777 }
26778 }
26779
26780 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
26781 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
26782 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
26783 Amt = Amt.getOperand(0);
26784 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
26785 std::vector<SDValue> Vals(Ratio);
26786 for (unsigned i = 0; i != Ratio; ++i)
26787 Vals[i] = Amt.getOperand(i);
26788 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
26789 for (unsigned j = 0; j != Ratio; ++j)
26790 if (Vals[j] != Amt.getOperand(i + j))
26791 return SDValue();
26792 }
26793
26794 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
26795 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
26796 }
26797 return SDValue();
26798}
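
For reference, a minimal scalar sketch (not part of the original source; the function name is illustrative) of the "shift as v8i16 + mask" path in LowerScalarVariableShift above, shown for SHL: a 16-bit shift leaks bits across the byte boundary, so each byte is re-masked with the bits a true per-byte shift would keep (the mask would be 0xFF >> Amt for SRL).

#include <cstdint>

uint16_t shl_two_bytes_via_i16(uint16_t Lane, unsigned Amt) { // Amt in [0, 7]
  uint16_t Shifted = static_cast<uint16_t>(Lane << Amt);      // like PSLLW on one 16-bit lane
  uint8_t ByteMask = static_cast<uint8_t>(0xFFu << Amt);      // per-byte keep mask
  uint16_t Mask = static_cast<uint16_t>(ByteMask | (ByteMask << 8));
  return Shifted & Mask;                                      // clear the cross-byte leakage
}
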
26799
26800// Convert a shift/rotate left amount to a multiplication scale factor.
26801static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
26802 const X86Subtarget &Subtarget,
26803 SelectionDAG &DAG) {
26804 MVT VT = Amt.getSimpleValueType();
26805 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
26806 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
26807 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
26808 return SDValue();
26809
26810 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
26811 SmallVector<SDValue, 8> Elts;
26812 MVT SVT = VT.getVectorElementType();
26813 unsigned SVTBits = SVT.getSizeInBits();
26814 APInt One(SVTBits, 1);
26815 unsigned NumElems = VT.getVectorNumElements();
26816
26817 for (unsigned i = 0; i != NumElems; ++i) {
26818 SDValue Op = Amt->getOperand(i);
26819 if (Op->isUndef()) {
26820 Elts.push_back(Op);
26821 continue;
26822 }
26823
26824 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
26825 APInt C(SVTBits, ND->getZExtValue());
26826 uint64_t ShAmt = C.getZExtValue();
26827 if (ShAmt >= SVTBits) {
26828 Elts.push_back(DAG.getUNDEF(SVT));
26829 continue;
26830 }
26831 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
26832 }
26833 return DAG.getBuildVector(VT, dl, Elts);
26834 }
26835
26836 // If the target doesn't support variable shifts, use either FP conversion
26837 // or integer multiplication to avoid shifting each element individually.
26838 if (VT == MVT::v4i32) {
26839 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
26840 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
26841 DAG.getConstant(0x3f800000U, dl, VT));
26842 Amt = DAG.getBitcast(MVT::v4f32, Amt);
26843 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
26844 }
26845
26846 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
26847 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
26848 SDValue Z = DAG.getConstant(0, dl, VT);
26849 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
26850 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
26851 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
26852 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
26853 if (Subtarget.hasSSE41())
26854 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
26855
26856 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
26857 DAG.getBitcast(VT, Hi),
26858 {0, 2, 4, 6, 8, 10, 12, 14});
26859 }
26860
26861 return SDValue();
26862}
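
For reference, a minimal scalar sketch (not part of the original source; the function name is illustrative) of the v4i32 branch of convertShiftLeftToScale above: (Amt << 23) + 0x3f800000 is the IEEE-754 bit pattern of the float 2^Amt (exponent field 127 + Amt, zero mantissa), so converting that float back to an integer yields the multiplier 1 << Amt.

#include <cstdint>
#include <cstring>

uint32_t shift_amount_to_scale(uint32_t Amt) { // sketch assumes Amt <= 30
  uint32_t Bits = (Amt << 23) + 0x3f800000u;   // build the float 2^Amt
  float F;
  std::memcpy(&F, &Bits, sizeof(F));           // the DAG's bitcast to v4f32
  return static_cast<uint32_t>(F);             // the DAG's FP_TO_SINT
}
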
26863
26864static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
26865 SelectionDAG &DAG) {
26866 MVT VT = Op.getSimpleValueType();
26867 SDLoc dl(Op);
26868 SDValue R = Op.getOperand(0);
26869 SDValue Amt = Op.getOperand(1);
26870 unsigned EltSizeInBits = VT.getScalarSizeInBits();
26871 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
26872
26873 unsigned Opc = Op.getOpcode();
26874 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
26875 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
26876
26877 assert(VT.isVector() && "Custom lowering only for vector shifts!");
26878 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
26879
26880 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
26881 return V;
26882
26883 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
26884 return V;
26885
26886 if (SupportedVectorVarShift(VT, Subtarget, Opc))
26887 return Op;
26888
26889 // XOP has 128-bit variable logical/arithmetic shifts.
26890 // +ve/-ve Amt = shift left/right.
26891 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
26892 VT == MVT::v8i16 || VT == MVT::v16i8)) {
26893 if (Opc == ISD::SRL || Opc == ISD::SRA) {
26894 SDValue Zero = DAG.getConstant(0, dl, VT);
26895 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
26896 }
26897 if (Opc == ISD::SHL || Opc == ISD::SRL)
26898 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
26899 if (Opc == ISD::SRA)
26900 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
26901 }
26902
26903 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
26904 // shifts per-lane and then shuffle the partial results back together.
26905 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
26906 // Splat the shift amounts so the scalar shifts above will catch it.
26907 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
26908 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
26909 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
26910 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
26911 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
26912 }
26913
26914 // i64 vector arithmetic shift can be emulated with the transform:
26915 // M = lshr(SIGN_MASK, Amt)
26916 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
26917 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
26918 Opc == ISD::SRA) {
26919 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
26920 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
26921 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
26922 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
26923 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
26924 return R;
26925 }
26926
26927 // If possible, lower this shift as a sequence of two shifts by
26928 // constant plus a BLENDing shuffle instead of scalarizing it.
26929 // Example:
26930 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
26931 //
26932 // Could be rewritten as:
26933 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
26934 //
26935 // The advantage is that the two shifts from the example would be
26936 // lowered as X86ISD::VSRLI nodes in parallel before blending.
26937 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
26938 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
26939 SDValue Amt1, Amt2;
26940 unsigned NumElts = VT.getVectorNumElements();
26941 SmallVector<int, 8> ShuffleMask;
26942 for (unsigned i = 0; i != NumElts; ++i) {
26943 SDValue A = Amt->getOperand(i);
26944 if (A.isUndef()) {
26945 ShuffleMask.push_back(SM_SentinelUndef);
26946 continue;
26947 }
26948 if (!Amt1 || Amt1 == A) {
26949 ShuffleMask.push_back(i);
26950 Amt1 = A;
26951 continue;
26952 }
26953 if (!Amt2 || Amt2 == A) {
26954 ShuffleMask.push_back(i + NumElts);
26955 Amt2 = A;
26956 continue;
26957 }
26958 break;
26959 }
26960
26961 // Only perform this blend if we can perform it without loading a mask.
26962 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
26963 (VT != MVT::v16i16 ||
26964 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
26965 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
26966 canWidenShuffleElements(ShuffleMask))) {
26967 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
26968 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
26969 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
26970 Cst2->getAPIntValue().ult(EltSizeInBits)) {
26971 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
26972 Cst1->getZExtValue(), DAG);
26973 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
26974 Cst2->getZExtValue(), DAG);
26975 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
26976 }
26977 }
26978 }
26979
26980 // If possible, lower this packed shift into a vector multiply instead of
26981 // expanding it into a sequence of scalar shifts.
26982 if (Opc == ISD::SHL)
26983 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
26984 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
26985
26986 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
26987 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
26988 if (Opc == ISD::SRL && ConstantAmt &&
26989 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
26990 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
26991 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
26992 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
26993 SDValue Zero = DAG.getConstant(0, dl, VT);
26994 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
26995 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
26996 return DAG.getSelect(dl, VT, ZAmt, R, Res);
26997 }
26998 }
26999
27000 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
27001 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
27002 // TODO: Special case handling for shift by 0/1, really we can afford either
27003 // of these cases in pre-SSE41/XOP/AVX512 but not both.
27004 if (Opc == ISD::SRA && ConstantAmt &&
27005 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
27006 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
27007 !Subtarget.hasAVX512()) ||
27008 DAG.isKnownNeverZero(Amt))) {
27009 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
27010 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
27011 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
27012 SDValue Amt0 =
27013 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
27014 SDValue Amt1 =
27015 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
27016 SDValue Sra1 =
27017 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
27018 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
27019 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
27020 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
27021 }
27022 }
27023
27024 // v4i32 Non Uniform Shifts.
27025 // If the shift amount is constant we can shift each lane using the SSE2
27026 // immediate shifts, else we need to zero-extend each lane to the lower i64
27027 // and shift using the SSE2 variable shifts.
27028 // The separate results can then be blended together.
27029 if (VT == MVT::v4i32) {
27030 SDValue Amt0, Amt1, Amt2, Amt3;
27031 if (ConstantAmt) {
27032 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
27033 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
27034 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
27035 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
27036 } else {
27037 // The SSE2 shifts use the lower i64 as the same shift amount for
27038 // all lanes and the upper i64 is ignored. On AVX we're better off
27039 // just zero-extending, but for SSE just duplicating the top 16-bits is
27040 // cheaper and has the same effect for out of range values.
27041 if (Subtarget.hasAVX()) {
27042 SDValue Z = DAG.getConstant(0, dl, VT);
27043 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
27044 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
27045 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
27046 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
27047 } else {
27048 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
27049 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27050 {4, 5, 6, 7, -1, -1, -1, -1});
27051 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27052 {0, 1, 1, 1, -1, -1, -1, -1});
27053 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
27054 {2, 3, 3, 3, -1, -1, -1, -1});
27055 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
27056 {0, 1, 1, 1, -1, -1, -1, -1});
27057 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
27058 {2, 3, 3, 3, -1, -1, -1, -1});
27059 }
27060 }
27061
27062 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
27063 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
27064 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
27065 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
27066 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
27067
27068 // Merge the shifted lane results optimally with/without PBLENDW.
27069 // TODO - ideally shuffle combining would handle this.
27070 if (Subtarget.hasSSE41()) {
27071 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
27072 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
27073 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
27074 }
27075 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
27076 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
27077 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
27078 }
27079
27080 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
27081 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
27082 // make the existing SSE solution better.
27083 // NOTE: We honor preferred vector width before promoting to 512-bits.
27084 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
27085 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
27086 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
27087 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
27088 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
27089 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
27090 "Unexpected vector type");
27091 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
27092 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
27093 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27094 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
27095 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
27096 return DAG.getNode(ISD::TRUNCATE, dl, VT,
27097 DAG.getNode(Opc, dl, ExtVT, R, Amt));
27098 }
27099
27100 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
27101 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
27102 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
27103 (VT == MVT::v16i8 || VT == MVT::v64i8 ||
27104 (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
27105 !Subtarget.hasXOP()) {
27106 int NumElts = VT.getVectorNumElements();
27107 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
27108
27109 // Extend constant shift amount to vXi16 (it doesn't matter if the type
27110 // isn't legal).
27111 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
27112 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
27113 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
27114 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
27115 assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
27116 "Constant build vector expected");
27117
27118 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
27119 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
27120 : DAG.getZExtOrTrunc(R, dl, ExVT);
27121 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
27122 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
27123 return DAG.getZExtOrTrunc(R, dl, VT);
27124 }
27125
27126 SmallVector<SDValue, 16> LoAmt, HiAmt;
27127 for (int i = 0; i != NumElts; i += 16) {
27128 for (int j = 0; j != 8; ++j) {
27129 LoAmt.push_back(Amt.getOperand(i + j));
27130 HiAmt.push_back(Amt.getOperand(i + j + 8));
27131 }
27132 }
27133
27134 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
27135 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
27136 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
27137
27138 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
27139 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
27140 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
27141 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
27142 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
27143 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
27144 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
27145 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
27146 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
27147 }
27148
27149 if (VT == MVT::v16i8 ||
27150 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
27151 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
27152 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
27153
27154 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
27155 if (VT.is512BitVector()) {
27156 // On AVX512BW targets we make use of the fact that VSELECT lowers
27157 // to a masked blend which selects bytes based just on the sign bit
27158 // extracted to a mask.
27159 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27160 V0 = DAG.getBitcast(VT, V0);
27161 V1 = DAG.getBitcast(VT, V1);
27162 Sel = DAG.getBitcast(VT, Sel);
27163 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
27164 ISD::SETGT);
27165 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
27166 } else if (Subtarget.hasSSE41()) {
27167 // On SSE41 targets we make use of the fact that VSELECT lowers
27168 // to PBLENDVB which selects bytes based just on the sign bit.
27169 V0 = DAG.getBitcast(VT, V0);
27170 V1 = DAG.getBitcast(VT, V1);
27171 Sel = DAG.getBitcast(VT, Sel);
27172 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
27173 }
27174 // On pre-SSE41 targets we test for the sign bit by comparing to
27175 // zero - a negative value will set all bits of the lanes to true
27176 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
27177 SDValue Z = DAG.getConstant(0, dl, SelVT);
27178 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
27179 return DAG.getSelect(dl, SelVT, C, V0, V1);
27180 };
27181
27182 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
27183 // We can safely do this using i16 shifts as we're only interested in
27184 // the 3 lower bits of each byte.
27185 Amt = DAG.getBitcast(ExtVT, Amt);
27186 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
27187 Amt = DAG.getBitcast(VT, Amt);
27188
27189 if (Opc == ISD::SHL || Opc == ISD::SRL) {
27190 // r = VSELECT(r, shift(r, 4), a);
27191 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
27192 R = SignBitSelect(VT, Amt, M, R);
27193
27194 // a += a
27195 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27196
27197 // r = VSELECT(r, shift(r, 2), a);
27198 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
27199 R = SignBitSelect(VT, Amt, M, R);
27200
27201 // a += a
27202 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27203
27204 // return VSELECT(r, shift(r, 1), a);
27205 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
27206 R = SignBitSelect(VT, Amt, M, R);
27207 return R;
27208 }
27209
27210 if (Opc == ISD::SRA) {
27211 // For SRA we need to unpack each byte to the higher byte of a i16 vector
27212 // so we can correctly sign extend. We don't care what happens to the
27213 // lower byte.
27214 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
27215 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
27216 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
27217 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
27218 ALo = DAG.getBitcast(ExtVT, ALo);
27219 AHi = DAG.getBitcast(ExtVT, AHi);
27220 RLo = DAG.getBitcast(ExtVT, RLo);
27221 RHi = DAG.getBitcast(ExtVT, RHi);
27222
27223 // r = VSELECT(r, shift(r, 4), a);
27224 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
27225 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
27226 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
27227 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
27228
27229 // a += a
27230 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
27231 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
27232
27233 // r = VSELECT(r, shift(r, 2), a);
27234 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
27235 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
27236 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
27237 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
27238
27239 // a += a
27240 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
27241 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
27242
27243 // r = VSELECT(r, shift(r, 1), a);
27244 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
27245 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
27246 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
27247 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
27248
27249 // Logical shift the result back to the lower byte, leaving a zero upper
27250 // byte meaning that we can safely pack with PACKUSWB.
27251 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
27252 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
27253 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
27254 }
27255 }
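// Worked example of the staged selects above (illustrative only): shifting one
// byte left by Amt = 5 (0b101), the pre-shifted mask first exposes bit 2 in the
// sign position, so the "shift by 4" result is selected; after Amt += Amt,
// bit 1 is exposed and the "shift by 2" result is rejected; finally bit 0
// selects the "shift by 1" result, giving R << 5 overall.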
27256
27257 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
27258 MVT ExtVT = MVT::v8i32;
27259 SDValue Z = DAG.getConstant(0, dl, VT);
27260 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
27261 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
27262 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
27263 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
27264 ALo = DAG.getBitcast(ExtVT, ALo);
27265 AHi = DAG.getBitcast(ExtVT, AHi);
27266 RLo = DAG.getBitcast(ExtVT, RLo);
27267 RHi = DAG.getBitcast(ExtVT, RHi);
27268 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
27269 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
27270 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
27271 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
27272 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
27273 }
27274
27275 if (VT == MVT::v8i16) {
27276 // If we have a constant shift amount, the non-SSE41 path is best as
27277 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
27278 bool UseSSE41 = Subtarget.hasSSE41() &&
27279 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
27280
27281 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
27282 // On SSE41 targets we make use of the fact that VSELECT lowers
27283 // to PBLENDVB which selects bytes based just on the sign bit.
27284 if (UseSSE41) {
27285 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
27286 V0 = DAG.getBitcast(ExtVT, V0);
27287 V1 = DAG.getBitcast(ExtVT, V1);
27288 Sel = DAG.getBitcast(ExtVT, Sel);
27289 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
27290 }
27291 // On pre-SSE41 targets we splat the sign bit - a negative value will
27292 // set all bits of the lanes to true and VSELECT uses that in
27293 // its OR(AND(V0,C),AND(V1,~C)) lowering.
27294 SDValue C =
27295 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
27296 return DAG.getSelect(dl, VT, C, V0, V1);
27297 };
27298
27299 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
27300 if (UseSSE41) {
27301 // On SSE41 targets we need to replicate the shift mask in both
27302 // bytes for PBLENDVB.
27303 Amt = DAG.getNode(
27304 ISD::OR, dl, VT,
27305 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
27306 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
27307 } else {
27308 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
27309 }
27310
27311 // r = VSELECT(r, shift(r, 8), a);
27312 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
27313 R = SignBitSelect(Amt, M, R);
27314
27315 // a += a
27316 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27317
27318 // r = VSELECT(r, shift(r, 4), a);
27319 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
27320 R = SignBitSelect(Amt, M, R);
27321
27322 // a += a
27323 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27324
27325 // r = VSELECT(r, shift(r, 2), a);
27326 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
27327 R = SignBitSelect(Amt, M, R);
27328
27329 // a += a
27330 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
27331
27332 // return VSELECT(r, shift(r, 1), a);
27333 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
27334 R = SignBitSelect(Amt, M, R);
27335 return R;
27336 }
27337
27338 // Decompose 256-bit shifts into 128-bit shifts.
27339 if (VT.is256BitVector())
27340 return split256IntArith(Op, DAG);
27341
27342 return SDValue();
27343}
27344
27345static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
27346 SelectionDAG &DAG) {
27347 MVT VT = Op.getSimpleValueType();
27347 assert(VT.isVector() && "Custom lowering only for vector rotates!");
27349
27350 SDLoc DL(Op);
27351 SDValue R = Op.getOperand(0);
27352 SDValue Amt = Op.getOperand(1);
27353 unsigned Opcode = Op.getOpcode();
27354 unsigned EltSizeInBits = VT.getScalarSizeInBits();
27355 int NumElts = VT.getVectorNumElements();
27356
27357 // Check for constant splat rotation amount.
27358 APInt UndefElts;
27359 SmallVector<APInt, 32> EltBits;
27360 int CstSplatIndex = -1;
27361 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
27362 for (int i = 0; i != NumElts; ++i)
27363 if (!UndefElts[i]) {
27364 if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
27365 CstSplatIndex = i;
27366 continue;
27367 }
27368 CstSplatIndex = -1;
27369 break;
27370 }
27371
27372 // Check for splat rotate by zero.
27373 if (0 <= CstSplatIndex && EltBits[CstSplatIndex].urem(EltSizeInBits) == 0)
27374 return R;
27375
27376 // AVX512 implicitly uses modulo rotation amounts.
27377 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
27378 // Attempt to rotate by immediate.
27379 if (0 <= CstSplatIndex) {
27380 unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
27381 uint64_t RotAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
27382 return DAG.getNode(RotOpc, DL, VT, R,
27383 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
27384 }
27385
27386 // Else, fall-back on VPROLV/VPRORV.
27387 return Op;
27388 }
27389
27390 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
27391
27392 // XOP has 128-bit vector variable + immediate rotates.
27393 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
27394 // XOP implicitly uses modulo rotation amounts.
27395 if (Subtarget.hasXOP()) {
27396 if (VT.is256BitVector())
27397 return split256IntArith(Op, DAG);
27398 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
27399
27400 // Attempt to rotate by immediate.
27401 if (0 <= CstSplatIndex) {
27402 uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
27403 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
27404 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
27405 }
27406
27407 // Use general rotate by variable (per-element).
27408 return Op;
27409 }
27410
27411 // Split 256-bit integers on pre-AVX2 targets.
27412 if (VT.is256BitVector() && !Subtarget.hasAVX2())
27413 return split256IntArith(Op, DAG);
27414
27415 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
27416 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
27417 Subtarget.hasAVX2())) &&
27418 "Only vXi32/vXi16/vXi8 vector rotates supported");
27419
27420 // Rotate by a uniform constant - expand back to shifts.
27421 if (0 <= CstSplatIndex)
27422 return SDValue();
27423
27424 bool IsSplatAmt = DAG.isSplatValue(Amt);
27425
27426 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
27427 // the amount bit.
27428 if (EltSizeInBits == 8 && !IsSplatAmt) {
27429 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()))
27430 return SDValue();
27431
27432 // We don't need ModuloAmt here as we just peek at individual bits.
27433 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
27434
27435 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
27436 if (Subtarget.hasSSE41()) {
27437 // On SSE41 targets we make use of the fact that VSELECT lowers
27438 // to PBLENDVB which selects bytes based just on the sign bit.
27439 V0 = DAG.getBitcast(VT, V0);
27440 V1 = DAG.getBitcast(VT, V1);
27441 Sel = DAG.getBitcast(VT, Sel);
27442 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
27443 }
27444 // On pre-SSE41 targets we test for the sign bit by comparing to
27445 // zero - a negative value will set all bits of the lanes to true
27446 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
27447 SDValue Z = DAG.getConstant(0, DL, SelVT);
27448 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
27449 return DAG.getSelect(DL, SelVT, C, V0, V1);
27450 };
27451
27452 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
27453 // We can safely do this using i16 shifts as we're only interested in
27454 // the 3 lower bits of each byte.
27455 Amt = DAG.getBitcast(ExtVT, Amt);
27456 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
27457 Amt = DAG.getBitcast(VT, Amt);
27458
27459 // r = VSELECT(r, rot(r, 4), a);
27460 SDValue M;
27461 M = DAG.getNode(
27462 ISD::OR, DL, VT,
27463 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
27464 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
27465 R = SignBitSelect(VT, Amt, M, R);
27466
27467 // a += a
27468 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
27469
27470 // r = VSELECT(r, rot(r, 2), a);
27471 M = DAG.getNode(
27472 ISD::OR, DL, VT,
27473 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
27474 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
27475 R = SignBitSelect(VT, Amt, M, R);
27476
27477 // a += a
27478 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
27479
27480 // return VSELECT(r, rot(r, 1), a);
27481 M = DAG.getNode(
27482 ISD::OR, DL, VT,
27483 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
27484 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
27485 return SignBitSelect(VT, Amt, M, R);
27486 }
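// Worked example of the rot4/rot2/rot1 staging above (illustrative only):
// rotating the byte 0b1000'0001 left by Amt = 3 (0b011) skips the rot4 stage
// (bit 2 clear), applies rot2 giving 0b0000'0110, then applies rot1 giving
// 0b0000'1100, i.e. a rotate-left by 3 in total.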
27487
27488 // ISD::ROT* uses modulo rotate amounts.
27489 Amt = DAG.getNode(ISD::AND, DL, VT, Amt,
27490 DAG.getConstant(EltSizeInBits - 1, DL, VT));
27491
27492 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
27493 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
27494 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
27495
27496 // Fallback for splats + all supported variable shifts.
27497 // Fallback for non-constant AVX2 vXi16 as well.
27498 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
27499 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
27500 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
27501 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
27502 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
27503 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
27504 }
27505
27506 // As with shifts, convert the rotation amount to a multiplication factor.
27507 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
27508 assert(Scale && "Failed to convert ROTL amount to scale");
27509
27510 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
27511 if (EltSizeInBits == 16) {
27512 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
27513 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
27514 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
27515 }
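// Worked example of the multiply-based rotate above (illustrative only): for
// a 16-bit lane with R = 0x8001 and a rotate amount of 3, Scale is 1 << 3, so
// MUL gives the low half 0x0008 and MULHU gives the wrapped high half 0x0004;
// OR'ing them yields 0x000C, which is rotl16(0x8001, 3).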
27516
27517 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
27518 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
27519 // that can then be OR'd with the lower 32-bits.
27520 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
27521 static const int OddMask[] = {1, -1, 3, -1};
27522 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
27523 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
27524
27525 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
27526 DAG.getBitcast(MVT::v2i64, R),
27527 DAG.getBitcast(MVT::v2i64, Scale));
27528 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
27529 DAG.getBitcast(MVT::v2i64, R13),
27530 DAG.getBitcast(MVT::v2i64, Scale13));
27531 Res02 = DAG.getBitcast(VT, Res02);
27532 Res13 = DAG.getBitcast(VT, Res13);
27533
27534 return DAG.getNode(ISD::OR, DL, VT,
27535 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
27536 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
27537}
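// Illustrative sketch (not used by the lowering above; the helper name and the
// <cstdint> types are assumptions): a scalar model of the multiply-based v4i32
// rotate, where PMULUDQ's full 32x32->64 product supplies both the shifted-in
// low half and the wrapped high half.
static inline uint32_t RotateLeft32ByMulModel(uint32_t R, unsigned Amt) {
  uint64_t Prod = (uint64_t)R << (Amt & 31);      // R * Scale, Scale = 1u << Amt
  return (uint32_t)Prod | (uint32_t)(Prod >> 32); // low half OR wrapped bits
}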
27538
27539/// Returns true if the operand type is exactly twice the native width, and
27540/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
27541/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
27542/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
27543bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
27544 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
27545
27546 if (OpWidth == 64)
27547 return Subtarget.hasCmpxchg8b() && !Subtarget.is64Bit();
27548 if (OpWidth == 128)
27549 return Subtarget.hasCmpxchg16b();
27550
27551 return false;
27552}
27553
27554bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27555 Type *MemType = SI->getValueOperand()->getType();
27556
27557 bool NoImplicitFloatOps =
27558 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
27559 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
27560 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
27561 (Subtarget.hasSSE1() || Subtarget.hasX87()))
27562 return false;
27563
27564 return needsCmpXchgNb(MemType);
27565}
27566
27567// Note: this turns large loads into lock cmpxchg8b/16b.
27568// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
27569TargetLowering::AtomicExpansionKind
27570X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27571 Type *MemType = LI->getType();
27572
27573 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
27574 // can use movq to do the load. If we have X87 we can load into an 80-bit
27575 // X87 register and store it to a stack temporary.
27576 bool NoImplicitFloatOps =
27577 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
27578 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
27579 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
27580 (Subtarget.hasSSE1() || Subtarget.hasX87()))
27581 return AtomicExpansionKind::None;
27582
27583 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
27584 : AtomicExpansionKind::None;
27585}
27586
27587TargetLowering::AtomicExpansionKind
27588X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
27589 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
27590 Type *MemType = AI->getType();
27591
27592 // If the operand is too big, we must see if cmpxchg8/16b is available
27593 // and default to library calls otherwise.
27594 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
27595 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
27596 : AtomicExpansionKind::None;
27597 }
27598
27599 AtomicRMWInst::BinOp Op = AI->getOperation();
27600 switch (Op) {
27601 default:
27602 llvm_unreachable("Unknown atomic operation");
27603 case AtomicRMWInst::Xchg:
27604 case AtomicRMWInst::Add:
27605 case AtomicRMWInst::Sub:
27606 // It's better to use xadd, xsub or xchg for these in all cases.
27607 return AtomicExpansionKind::None;
27608 case AtomicRMWInst::Or:
27609 case AtomicRMWInst::And:
27610 case AtomicRMWInst::Xor:
27611 // If the atomicrmw's result isn't actually used, we can just add a "lock"
27612 // prefix to a normal instruction for these operations.
27613 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
27614 : AtomicExpansionKind::None;
27615 case AtomicRMWInst::Nand:
27616 case AtomicRMWInst::Max:
27617 case AtomicRMWInst::Min:
27618 case AtomicRMWInst::UMax:
27619 case AtomicRMWInst::UMin:
27620 case AtomicRMWInst::FAdd:
27621 case AtomicRMWInst::FSub:
27622 // These always require a non-trivial set of data operations on x86. We must
27623 // use a cmpxchg loop.
27624 return AtomicExpansionKind::CmpXChg;
27625 }
27626}
27627
27628LoadInst *
27629X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
27630 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
27631 Type *MemType = AI->getType();
27632 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
27633 // there is no benefit in turning such RMWs into loads, and it is actually
27634 // harmful as it introduces an mfence.
27635 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
27636 return nullptr;
27637
27638 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
27639 // lowering available in lowerAtomicArith.
27640 // TODO: push more cases through this path.
27641 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
27642 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
27643 AI->use_empty())
27644 return nullptr;
27645
27646 IRBuilder<> Builder(AI);
27647 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
27648 auto SSID = AI->getSyncScopeID();
27649 // We must restrict the ordering to avoid generating loads with Release or
27650 // ReleaseAcquire orderings.
27651 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
27652
27653 // Before the load we need a fence. Here is an example lifted from
27654 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
27655 // is required:
27656 // Thread 0:
27657 // x.store(1, relaxed);
27658 // r1 = y.fetch_add(0, release);
27659 // Thread 1:
27660 // y.fetch_add(42, acquire);
27661 // r2 = x.load(relaxed);
27662 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
27663 // lowered to just a load without a fence. A mfence flushes the store buffer,
27664 // making the optimization clearly correct.
27665 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
27666 // otherwise; we might be able to be more aggressive on relaxed idempotent
27667 // rmw. In practice, they do not look useful, so we don't try to be
27668 // especially clever.
27669 if (SSID == SyncScope::SingleThread)
27670 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
27671 // the IR level, so we must wrap it in an intrinsic.
27672 return nullptr;
27673
27674 if (!Subtarget.hasMFence())
27675 // FIXME: it might make sense to use a locked operation here but on a
27676 // different cache-line to prevent cache-line bouncing. In practice it
27677 // is probably a small win, and x86 processors without mfence are rare
27678 // enough that we do not bother.
27679 return nullptr;
27680
27681 Function *MFence =
27682 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
27683 Builder.CreateCall(MFence, {});
27684
27685 // Finally we can emit the atomic load.
27686 LoadInst *Loaded =
27687 Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
27688 Align(AI->getType()->getPrimitiveSizeInBits()));
27689 Loaded->setAtomic(Order, SSID);
27690 AI->replaceAllUsesWith(Loaded);
27691 AI->eraseFromParent();
27692 return Loaded;
27693}
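// Rough sketch of the rewrite performed above (illustrative only; the exact
// IR, ordering and alignment depend on the original atomicrmw):
//   %old = atomicrmw add i32* %ptr, i32 0 seq_cst
// becomes approximately
//   call void @llvm.x86.sse2.mfence()
//   %old = load atomic i32, i32* %ptr seq_cst, align 4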
27694
27695bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
27696 if (!SI.isUnordered())
27697 return false;
27698 return ExperimentalUnorderedISEL;
27699}
27700bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
27701 if (!LI.isUnordered())
27702 return false;
27703 return ExperimentalUnorderedISEL;
27704}
27705
27706
27707/// Emit a locked operation on a stack location which does not change any
27708/// memory location, but does involve a lock prefix. Location is chosen to be
27709/// a) very likely accessed only by a single thread to minimize cache traffic,
27710/// and b) definitely dereferenceable. Returns the new Chain result.
27711static SDValue emitLockedStackOp(SelectionDAG &DAG,
27712 const X86Subtarget &Subtarget,
27713 SDValue Chain, SDLoc DL) {
27714 // Implementation notes:
27715 // 1) LOCK prefix creates a full read/write reordering barrier for memory
27716 // operations issued by the current processor. As such, the location
27717 // referenced is not relevant for the ordering properties of the instruction.
27718 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
27719 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
27720 // 2) Using an immediate operand appears to be the best encoding choice
27721 // here since it doesn't require an extra register.
27722 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
27723 // is small enough it might just be measurement noise.)
27724 // 4) When choosing offsets, there are several contributing factors:
27725 // a) If there's no redzone, we default to TOS. (We could allocate a cache
27726 // line aligned stack object to improve this case.)
27727 // b) To minimize our chances of introducing a false dependence, we prefer
27728 // to offset the stack usage from TOS slightly.
27729 // c) To minimize concerns about cross thread stack usage - in particular,
27730 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
27731 // captures state in the TOS frame and accesses it from many threads -
27732 // we want to use an offset such that the offset is in a distinct cache
27733 // line from the TOS frame.
27734 //
27735 // For a general discussion of the tradeoffs and benchmark results, see:
27736 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
27737
27738 auto &MF = DAG.getMachineFunction();
27739 auto &TFL = *Subtarget.getFrameLowering();
27740 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
27741
27742 if (Subtarget.is64Bit()) {
27743 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
27744 SDValue Ops[] = {
27745 DAG.getRegister(X86::RSP, MVT::i64), // Base
27746 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27747 DAG.getRegister(0, MVT::i64), // Index
27748 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
27749 DAG.getRegister(0, MVT::i16), // Segment.
27750 Zero,
27751 Chain};
27752 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
27753 MVT::Other, Ops);
27754 return SDValue(Res, 1);
27755 }
27756
27757 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
27758 SDValue Ops[] = {
27759 DAG.getRegister(X86::ESP, MVT::i32), // Base
27760 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27761 DAG.getRegister(0, MVT::i32), // Index
27762 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
27763 DAG.getRegister(0, MVT::i16), // Segment.
27764 Zero,
27765 Chain
27766 };
27767 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
27768 MVT::Other, Ops);
27769 return SDValue(Res, 1);
27770}
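// Illustrative sketch of the emitted idiom (assumptions: a 64-bit target with
// a 128-byte red zone, so SPOffset is -64):
//   lock orl $0x0, -64(%rsp)
// and, on 32-bit targets (no red zone, SPOffset is 0):
//   lock orl $0x0, (%esp)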
27771
27772static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
27773 SelectionDAG &DAG) {
27774 SDLoc dl(Op);
27775 AtomicOrdering FenceOrdering =
27776 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
27777 SyncScope::ID FenceSSID =
27778 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
27779
27780 // The only fence that needs an instruction is a sequentially-consistent
27781 // cross-thread fence.
27782 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
27783 FenceSSID == SyncScope::System) {
27784 if (Subtarget.hasMFence())
27785 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
27786
27787 SDValue Chain = Op.getOperand(0);
27788 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
27789 }
27790
27791 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
27792 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
27793}
27794
27795static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
27796 SelectionDAG &DAG) {
27797 MVT T = Op.getSimpleValueType();
27798 SDLoc DL(Op);
27799 unsigned Reg = 0;
27800 unsigned size = 0;
27801 switch(T.SimpleTy) {
27802 default: llvm_unreachable("Invalid value type!");
27803 case MVT::i8: Reg = X86::AL; size = 1; break;
27804 case MVT::i16: Reg = X86::AX; size = 2; break;
27805 case MVT::i32: Reg = X86::EAX; size = 4; break;
27806 case MVT::i64:
27807 assert(Subtarget.is64Bit() && "Node not type legal!");
27808 Reg = X86::RAX; size = 8;
27809 break;
27810 }
27811 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
27812 Op.getOperand(2), SDValue());
27813 SDValue Ops[] = { cpIn.getValue(0),
27814 Op.getOperand(1),
27815 Op.getOperand(3),
27816 DAG.getTargetConstant(size, DL, MVT::i8),
27817 cpIn.getValue(1) };
27818 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27819 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
27820 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
27821 Ops, T, MMO);
27822
27823 SDValue cpOut =
27824 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
27825 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
27826 MVT::i32, cpOut.getValue(2));
27827 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
27828
27829 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27830 cpOut, Success, EFLAGS.getValue(1));
27831}
27832
27833// Create MOVMSKB, taking into account whether we need to split for AVX1.
27834static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
27835 const X86Subtarget &Subtarget) {
27836 MVT InVT = V.getSimpleValueType();
27837
27838 if (InVT == MVT::v64i8) {
27839 SDValue Lo, Hi;
27840 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
27841 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
27842 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
27843 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
27844 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
27845 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
27846 DAG.getConstant(32, DL, MVT::i8));
27847 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
27848 }
27849 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
27850 SDValue Lo, Hi;
27851 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
27852 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
27853 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
27854 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
27855 DAG.getConstant(16, DL, MVT::i8));
27856 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
27857 }
27858
27859 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
27860}
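// Illustrative sketch (not used by the code above; the helper name and the
// <cstdint> types are assumptions): how the split paths recombine two per-half
// MOVMSK results into a single mask, e.g. for v32i8 without AVX2.
static inline uint32_t CombinePMOVMSKBHalvesModel(uint16_t LoMask,
                                                  uint16_t HiMask) {
  // Low 16 sign bits stay in place; the high half is shifted up by 16.
  return (uint32_t)LoMask | ((uint32_t)HiMask << 16);
}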
27861
27862static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
27863 SelectionDAG &DAG) {
27864 SDValue Src = Op.getOperand(0);
27865 MVT SrcVT = Src.getSimpleValueType();
27866 MVT DstVT = Op.getSimpleValueType();
27867
27868 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
27869 // half to v32i1 and concatenating the result.
27870 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
27871 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
27872 assert(Subtarget.hasBWI() && "Expected BWI target");
27873 SDLoc dl(Op);
27874 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
27875 DAG.getIntPtrConstant(0, dl));
27876 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27877 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
27878 DAG.getIntPtrConstant(1, dl));
27879 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27880 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27881 }
27882
27883 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
27884 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
27885 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
27886 SDLoc dl(Op);
27887 SDValue Lo, Hi;
27888 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
27889 MVT CastVT = DstVT.getHalfNumVectorElementsVT();
27890 Lo = DAG.getBitcast(CastVT, Lo);
27891 Hi = DAG.getBitcast(CastVT, Hi);
27892 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
27893 }
27894
27895 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
27896 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
27897 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
27898 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
27899 SDLoc DL(Op);
27900 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
27901 V = getPMOVMSKB(DL, V, DAG, Subtarget);
27902 return DAG.getZExtOrTrunc(V, DL, DstVT);
27903 }
27904
27905 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
27906 SrcVT == MVT::i64) && "Unexpected VT!");
27907
27908 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
27909 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
27910 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
27911 // This conversion needs to be expanded.
27912 return SDValue();
27913
27914 SDLoc dl(Op);
27915 if (SrcVT.isVector()) {
27916 // Widen the input vector in the case of MVT::v2i32.
27917 // Example: from MVT::v2i32 to MVT::v4i32.
27918 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
27919 SrcVT.getVectorNumElements() * 2);
27920 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
27921 DAG.getUNDEF(SrcVT));
27922 } else {
27923 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
27924 "Unexpected source type in LowerBITCAST");
27925 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
27926 }
27927
27928 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
27929 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
27930
27931 if (DstVT == MVT::x86mmx)
27932 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
27933
27934 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
27935 DAG.getIntPtrConstant(0, dl));
27936}
27937
27938/// Compute the horizontal sum of bytes in V for the elements of VT.
27939///
27940/// Requires V to be a byte vector and VT to be an integer vector type with
27941/// wider elements than V's type. The width of the elements of VT determines
27942/// how many bytes of V are summed horizontally to produce each element of the
27943/// result.
27944static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
27945 const X86Subtarget &Subtarget,
27946 SelectionDAG &DAG) {
27947 SDLoc DL(V);
27948 MVT ByteVecVT = V.getSimpleValueType();
27949 MVT EltVT = VT.getVectorElementType();
27950 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
27951 "Expected value to have byte element type.");
27952 assert(EltVT != MVT::i8 &&
27953 "Horizontal byte sum only makes sense for wider elements!");
27954 unsigned VecSize = VT.getSizeInBits();
27955 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
27956
27957 // The PSADBW instruction horizontally adds all bytes and leaves the result in
27958 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
27959 if (EltVT == MVT::i64) {
27960 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
27961 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
27962 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
27963 return DAG.getBitcast(VT, V);
27964 }
27965
27966 if (EltVT == MVT::i32) {
27967 // We unpack the low half and high half into i32s interleaved with zeros so
27968 // that we can use PSADBW to horizontally sum them. The most useful part of
27969 // this is that it lines up the results of two PSADBW instructions to be
27970 // two v2i64 vectors which concatenated are the 4 population counts. We can
27971 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
27972 SDValue Zeros = DAG.getConstant(0, DL, VT);
27973 SDValue V32 = DAG.getBitcast(VT, V);
27974 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
27975 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
27976
27977 // Do the horizontal sums into two v2i64s.
27978 Zeros = DAG.getConstant(0, DL, ByteVecVT);
27979 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
27980 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
27981 DAG.getBitcast(ByteVecVT, Low), Zeros);
27982 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
27983 DAG.getBitcast(ByteVecVT, High), Zeros);
27984
27985 // Merge them together.
27986 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
27987 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
27988 DAG.getBitcast(ShortVecVT, Low),
27989 DAG.getBitcast(ShortVecVT, High));
27990
27991 return DAG.getBitcast(VT, V);
27992 }
27993
27994 // The only element type left is i16.
27995 assert(EltVT == MVT::i16 && "Unknown how to handle type");
27996
27997 // To obtain pop count for each i16 element starting from the pop count for
27998 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
27999 // right by 8. It is important to shift as i16s since an i8 vector shift isn't
28000 // directly supported.
28001 SDValue ShifterV = DAG.getConstant(8, DL, VT);
28002 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
28003 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
28004 DAG.getBitcast(ByteVecVT, V));
28005 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
28006}
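// Illustrative sketch (not used by the code above; the helper name and the
// <cstdint> types are assumptions): scalar model of the i16 tail of the
// routine above. The lane holds two per-byte pop counts; shifting left by 8,
// adding byte-wise and shifting right by 8 leaves their sum in the lane. Each
// count is <= 8, so the byte-wise add cannot carry.
static inline uint16_t HorizontalByteSum16Model(uint16_t PerBytePopCounts) {
  uint8_t LoCount = (uint8_t)(PerBytePopCounts & 0xFF);
  uint8_t HiCount = (uint8_t)(PerBytePopCounts >> 8);
  uint8_t HiByte = (uint8_t)(LoCount + HiCount); // byte-wise add after SHL by 8
  return (uint16_t)HiByte;                       // logical shift right by 8
}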
28007
28008static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
28009 const X86Subtarget &Subtarget,
28010 SelectionDAG &DAG) {
28011 MVT VT = Op.getSimpleValueType();
28012 MVT EltVT = VT.getVectorElementType();
28013 int NumElts = VT.getVectorNumElements();
28014 (void)EltVT;
28015 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
28016
28017 // Implement a lookup table in register by using an algorithm based on:
28018 // http://wm.ite.pl/articles/sse-popcount.html
28019 //
28020 // The general idea is that every lower byte nibble in the input vector is an
28021 // index into an in-register pre-computed pop count table. We then split up the
28022 // input vector into two new ones: (1) a vector with only the shifted-right
28023 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
28024 // masked-out higher ones) for each byte. PSHUFB is used separately with both
28025 // to index the in-register table. Next, both are added and the result is an
28026 // i8 vector where each element contains the pop count for the input byte.
28027 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
28028 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
28029 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
28030 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
28031
28032 SmallVector<SDValue, 64> LUTVec;
28033 for (int i = 0; i < NumElts; ++i)
28034 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
28035 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
28036 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
28037
28038 // High nibbles
28039 SDValue FourV = DAG.getConstant(4, DL, VT);
28040 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
28041
28042 // Low nibbles
28043 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
28044
28045 // The input vector is used as the shuffle mask that index elements into the
28046 // LUT. After counting low and high nibbles, add the vector to obtain the
28047 // final pop count per i8 element.
28048 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
28049 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
28050 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
28051}
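// Illustrative sketch (not used by the code above; the helper name and the
// <cstdint> types are assumptions): scalar equivalent of the in-register LUT
// above, applied to a single byte. PSHUFB performs the two table lookups in
// parallel across the whole vector.
static inline uint8_t PopCountByteLUTModel(uint8_t X) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return (uint8_t)(LUT[X >> 4] + LUT[X & 0x0F]); // high nibble + low nibble
}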
28052
28053// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
28054// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
28055static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
28056 SelectionDAG &DAG) {
28057 MVT VT = Op.getSimpleValueType();
28058 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
28059 "Unknown CTPOP type to handle");
28060 SDLoc DL(Op.getNode());
28061 SDValue Op0 = Op.getOperand(0);
28062
28063 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
28064 if (Subtarget.hasVPOPCNTDQ()) {
28065 unsigned NumElems = VT.getVectorNumElements();
28066 assert((VT.getVectorElementType() == MVT::i8 ||
28067 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
28068 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
28069 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28070 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
28071 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
28072 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
28073 }
28074 }
28075
28076 // Decompose 256-bit ops into smaller 128-bit ops.
28077 if (VT.is256BitVector() && !Subtarget.hasInt256())
28078 return Lower256IntUnary(Op, DAG);
28079
28080 // Decompose 512-bit ops into smaller 256-bit ops.
28081 if (VT.is512BitVector() && !Subtarget.hasBWI())
28082 return Lower512IntUnary(Op, DAG);
28083
28084 // For element types greater than i8, do vXi8 pop counts and a bytesum.
28085 if (VT.getScalarType() != MVT::i8) {
28086 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
28087 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
28088 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
28089 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
28090 }
28091
28092 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
28093 if (!Subtarget.hasSSSE3())
28094 return SDValue();
28095
28096 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
28097}
28098
28099static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
28100 SelectionDAG &DAG) {
28101 assert(Op.getSimpleValueType().isVector() &&
28102 "We only do custom lowering for vector population count.");
28103 return LowerVectorCTPOP(Op, Subtarget, DAG);
28104}
28105
28106static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
28107 MVT VT = Op.getSimpleValueType();
28108 SDValue In = Op.getOperand(0);
28109 SDLoc DL(Op);
28110
28111 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
28112 // perform the BITREVERSE.
28113 if (!VT.isVector()) {
28114 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
28115 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
28116 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
28117 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
28118 DAG.getIntPtrConstant(0, DL));
28119 }
28120
28121 int NumElts = VT.getVectorNumElements();
28122 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
28123
28124 // Decompose 256-bit ops into smaller 128-bit ops.
28125 if (VT.is256BitVector())
28126 return Lower256IntUnary(Op, DAG);
28127
28128 assert(VT.is128BitVector() &&
28129 "Only 128-bit vector bitreverse lowering supported.");
28130
28131 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
28132 // perform the BSWAP in the shuffle.
28133 // It's best to shuffle using the second operand, as this will implicitly allow
28134 // memory folding for multiple vectors.
28135 SmallVector<SDValue, 16> MaskElts;
28136 for (int i = 0; i != NumElts; ++i) {
28137 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
28138 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
28139 int PermuteByte = SourceByte | (2 << 5);
28140 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
28141 }
28142 }
28143
28144 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
28145 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
28146 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
28147 Res, Mask);
28148 return DAG.getBitcast(VT, Res);
28149}
28150
28151static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
28152 SelectionDAG &DAG) {
28153 MVT VT = Op.getSimpleValueType();
28154
28155 if (Subtarget.hasXOP() && !VT.is512BitVector())
28156 return LowerBITREVERSE_XOP(Op, DAG);
28157
28158 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
28159
28160 SDValue In = Op.getOperand(0);
28161 SDLoc DL(Op);
28162
28163 // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
28164 // lowering.
28165 if (VT == MVT::v8i64 || VT == MVT::v16i32) {
28166 assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
28167 return Lower512IntUnary(Op, DAG);
28168 }
28169
28170 unsigned NumElts = VT.getVectorNumElements();
28171 assert(VT.getScalarType() == MVT::i8 &&
28172 "Only byte vector BITREVERSE supported");
28173
28174 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
28175 if (VT.is256BitVector() && !Subtarget.hasInt256())
28176 return Lower256IntUnary(Op, DAG);
28177
28178 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
28179 // two nibbles and a PSHUFB lookup to find the bitreverse of each
28180 // 0-15 value (moved to the other nibble).
28181 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
28182 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
28183 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
28184
28185 const int LoLUT[16] = {
28186 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
28187 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
28188 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
28189 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
28190 const int HiLUT[16] = {
28191 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
28192 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
28193 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
28194 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
28195
28196 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
28197 for (unsigned i = 0; i < NumElts; ++i) {
28198 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
28199 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
28200 }
28201
28202 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
28203 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
28204 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
28205 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
28206 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
28207}
28208
28209static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
28210 const X86Subtarget &Subtarget) {
28211 unsigned NewOpc = 0;
28212 switch (N->getOpcode()) {
28213 case ISD::ATOMIC_LOAD_ADD:
28214 NewOpc = X86ISD::LADD;
28215 break;
28216 case ISD::ATOMIC_LOAD_SUB:
28217 NewOpc = X86ISD::LSUB;
28218 break;
28219 case ISD::ATOMIC_LOAD_OR:
28220 NewOpc = X86ISD::LOR;
28221 break;
28222 case ISD::ATOMIC_LOAD_XOR:
28223 NewOpc = X86ISD::LXOR;
28224 break;
28225 case ISD::ATOMIC_LOAD_AND:
28226 NewOpc = X86ISD::LAND;
28227 break;
28228 default:
28229 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode")::llvm::llvm_unreachable_internal("Unknown ATOMIC_LOAD_ opcode"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28229)
;
28230 }
28231
28232 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
28233
28234 return DAG.getMemIntrinsicNode(
28235 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
28236 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
28237 /*MemVT=*/N->getSimpleValueType(0), MMO);
28238}
28239
28240/// Lower atomic_load_ops into LOCK-prefixed operations.
28241static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
28242 const X86Subtarget &Subtarget) {
28243 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
28244 SDValue Chain = N->getOperand(0);
28245 SDValue LHS = N->getOperand(1);
28246 SDValue RHS = N->getOperand(2);
28247 unsigned Opc = N->getOpcode();
28248 MVT VT = N->getSimpleValueType(0);
28249 SDLoc DL(N);
28250
28251 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
28252 // can only be lowered when the result is unused. They should have already
28253 // been transformed into a cmpxchg loop in AtomicExpand.
28254 if (N->hasAnyUseOfValue(0)) {
28255 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
28256 // select LXADD if LOCK_SUB can't be selected.
28257 if (Opc == ISD::ATOMIC_LOAD_SUB) {
28258 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
28259 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
28260 RHS, AN->getMemOperand());
28261 }
28262 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
28263        "Used AtomicRMW ops other than Add should have been expanded!");
28264 return N;
28265 }
28266
28267 // Specialized lowering for the canonical form of an idempotent atomicrmw.
28268 // The core idea here is that since the memory location isn't actually
28269 // changing, all we need is a lowering for the *ordering* impact of the
28270 // atomicrmw. As such, we can choose a different operation and memory
28271 // location to minimize the impact on other code.
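// For example (illustrative, added note): the canonical idempotent form
// produced by the middle end is typically "atomicrmw or i32* %p, i32 0
// seq_cst"; the OR with zero leaves memory unchanged, so only its fence
// semantics matter here, which is what the isNullConstant check below matches.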
28272 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
28273 // On X86, the only ordering which actually requires an instruction is a
28274 // seq_cst operation that isn't SingleThread; everything else just needs to
28275 // be preserved during codegen and then dropped. Note that we expect (but
28276 // don't assume) that orderings other than seq_cst and acq_rel have been
28277 // canonicalized to a store or load.
28278 if (AN->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
28279 AN->getSyncScopeID() == SyncScope::System) {
28280 // Prefer a locked operation against a stack location to minimize cache
28281 // traffic. This assumes that stack locations are very likely to be
28282 // accessed only by the owning thread.
28283 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
28284 assert(!N->hasAnyUseOfValue(0))((!N->hasAnyUseOfValue(0)) ? static_cast<void> (0) :
__assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28284, __PRETTY_FUNCTION__))
;
28285 // NOTE: The getUNDEF is needed to give something for the unused result 0.
28286 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
28287 DAG.getUNDEF(VT), NewChain);
28288 }
28289 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
28290 SDValue NewChain = DAG.getNode(X86ISD::MEMBARRIER, DL, MVT::Other, Chain);
28291 assert(!N->hasAnyUseOfValue(0))((!N->hasAnyUseOfValue(0)) ? static_cast<void> (0) :
__assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28291, __PRETTY_FUNCTION__))
;
28292 // NOTE: The getUNDEF is needed to give something for the unused result 0.
28293 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
28294 DAG.getUNDEF(VT), NewChain);
28295 }
28296
28297 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
28298 // RAUW the chain, but don't worry about the result, as it's unused.
28299 assert(!N->hasAnyUseOfValue(0))((!N->hasAnyUseOfValue(0)) ? static_cast<void> (0) :
__assert_fail ("!N->hasAnyUseOfValue(0)", "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28299, __PRETTY_FUNCTION__))
;
28300 // NOTE: The getUNDEF is needed to give something for the unused result 0.
28301 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
28302 DAG.getUNDEF(VT), LockOp.getValue(1));
28303}
28304
28305static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
28306 const X86Subtarget &Subtarget) {
28307 auto *Node = cast<AtomicSDNode>(Op.getNode());
28308 SDLoc dl(Node);
28309 EVT VT = Node->getMemoryVT();
28310
28311 bool IsSeqCst = Node->getOrdering() == AtomicOrdering::SequentiallyConsistent;
28312 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
28313
28314 // If this store is not sequentially consistent and the type is legal
28315 // we can just keep it.
28316 if (!IsSeqCst && IsTypeLegal)
28317 return Op;
28318
28319 if (VT == MVT::i64 && !IsTypeLegal) {
28320 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
28321 // is enabled.
28322 bool NoImplicitFloatOps =
28323 DAG.getMachineFunction().getFunction().hasFnAttribute(
28324 Attribute::NoImplicitFloat);
28325 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
28326 SDValue Chain;
28327 if (Subtarget.hasSSE1()) {
28328 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
28329 Node->getOperand(2));
28330 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
28331 SclToVec = DAG.getBitcast(StVT, SclToVec);
28332 SDVTList Tys = DAG.getVTList(MVT::Other);
28333 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
28334 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
28335 MVT::i64, Node->getMemOperand());
28336 } else if (Subtarget.hasX87()) {
28337 // First load this into an 80-bit X87 register using a stack temporary.
28338 // This will put the whole integer into the significand.
28339 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
28340 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28341 MachinePointerInfo MPI =
28342 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28343 Chain =
28344 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
28345 MPI, /*Align*/ 0, MachineMemOperand::MOStore);
28346 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
28347 SDValue LdOps[] = {Chain, StackPtr};
28348 SDValue Value =
28349 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
28350 /*Align*/ 0, MachineMemOperand::MOLoad);
28351 Chain = Value.getValue(1);
28352
28353 // Now use an FIST to do the atomic store.
28354 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
28355 Chain =
28356 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
28357 StoreOps, MVT::i64, Node->getMemOperand());
28358 }
28359
28360 if (Chain) {
28361 // If this is a sequentially consistent store, also emit an appropriate
28362 // barrier.
28363 if (IsSeqCst)
28364 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
28365
28366 return Chain;
28367 }
28368 }
28369 }
28370
28371 // Convert seq_cst store -> xchg
28372 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
28373 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
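// Added note: XCHG with a memory operand carries an implicit LOCK prefix, so
// lowering the seq_cst store to a swap also provides the full fence that the
// original atomic store requires.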
28374 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
28375 Node->getMemoryVT(),
28376 Node->getOperand(0),
28377 Node->getOperand(1), Node->getOperand(2),
28378 Node->getMemOperand());
28379 return Swap.getValue(1);
28380}
28381
28382static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
28383 SDNode *N = Op.getNode();
28384 MVT VT = N->getSimpleValueType(0);
28385
28386 // Let legalize expand this if it isn't a legal type yet.
28387 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
28388 return SDValue();
28389
28390 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28391 SDLoc DL(N);
28392
28393 // Set the carry flag.
28394 SDValue Carry = Op.getOperand(2);
28395 EVT CarryVT = Carry.getValueType();
28396 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
28397 Carry, DAG.getAllOnesConstant(DL, CarryVT));
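// Added note: adding all-ones to the incoming carry value sets CF exactly when
// the carry operand is non-zero (x + 0xFF..FF wraps iff x != 0), so the
// ADC/SBB below can consume the flag result Carry.getValue(1).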
28398
28399 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
28400 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
28401 Op.getOperand(1), Carry.getValue(1));
28402
28403 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
28404 if (N->getValueType(1) == MVT::i1)
28405 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
28406
28407 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
28408}
28409
28410static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
28411 SelectionDAG &DAG) {
28412 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
28413
28414 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
28415 // which returns the values as { float, float } (in XMM0) or
28416 // { double, double } (which is returned in XMM0, XMM1).
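// Added summary: for the f32 case the call effectively returns a small vector
// in xmm0 with sin(x) in lane 0 and cos(x) in lane 1, which is why the
// lowering below extracts elements 0 and 1 of the call result.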
28417 SDLoc dl(Op);
28418 SDValue Arg = Op.getOperand(0);
28419 EVT ArgVT = Arg.getValueType();
28420 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28421
28422 TargetLowering::ArgListTy Args;
28423 TargetLowering::ArgListEntry Entry;
28424
28425 Entry.Node = Arg;
28426 Entry.Ty = ArgTy;
28427 Entry.IsSExt = false;
28428 Entry.IsZExt = false;
28429 Args.push_back(Entry);
28430
28431 bool isF64 = ArgVT == MVT::f64;
28432 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
28433 // the small struct {f32, f32} is returned in (eax, edx). For f64,
28434 // the results are returned via SRet in memory.
28435 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28436 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
28437 const char *LibcallName = TLI.getLibcallName(LC);
28438 SDValue Callee =
28439 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
28440
28441 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
28442 : (Type *)VectorType::get(ArgTy, 4);
28443
28444 TargetLowering::CallLoweringInfo CLI(DAG);
28445 CLI.setDebugLoc(dl)
28446 .setChain(DAG.getEntryNode())
28447 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
28448
28449 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
28450
28451 if (isF64)
28452 // Returned in xmm0 and xmm1.
28453 return CallResult.first;
28454
28455 // Returned in bits 0:31 and 32:63 of xmm0.
28456 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
28457 CallResult.first, DAG.getIntPtrConstant(0, dl));
28458 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
28459 CallResult.first, DAG.getIntPtrConstant(1, dl));
28460 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
28461 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
28462}
28463
28464/// Widen a vector input to a vector of NVT. The
28465/// input vector must have the same element type as NVT.
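/// For example (illustrative): widening a v2i64 value to v8i64 inserts the
/// input at subvector index 0 and fills the six upper lanes with zeroes when
/// FillWithZeroes is set, or with undef otherwise.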
28466static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
28467 bool FillWithZeroes = false) {
28468 // Check if InOp already has the right width.
28469 MVT InVT = InOp.getSimpleValueType();
28470 if (InVT == NVT)
28471 return InOp;
28472
28473 if (InOp.isUndef())
28474 return DAG.getUNDEF(NVT);
28475
28476 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
28477        "input and widen element type must match");
28478
28479 unsigned InNumElts = InVT.getVectorNumElements();
28480 unsigned WidenNumElts = NVT.getVectorNumElements();
28481 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&((WidenNumElts > InNumElts && WidenNumElts % InNumElts
== 0 && "Unexpected request for vector widening") ? static_cast
<void> (0) : __assert_fail ("WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && \"Unexpected request for vector widening\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28482, __PRETTY_FUNCTION__))
28482 "Unexpected request for vector widening")((WidenNumElts > InNumElts && WidenNumElts % InNumElts
== 0 && "Unexpected request for vector widening") ? static_cast
<void> (0) : __assert_fail ("WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && \"Unexpected request for vector widening\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28482, __PRETTY_FUNCTION__))
;
28483
28484 SDLoc dl(InOp);
28485 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
28486 InOp.getNumOperands() == 2) {
28487 SDValue N1 = InOp.getOperand(1);
28488 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
28489 N1.isUndef()) {
28490 InOp = InOp.getOperand(0);
28491 InVT = InOp.getSimpleValueType();
28492 InNumElts = InVT.getVectorNumElements();
28493 }
28494 }
28495 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
28496 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
28497 SmallVector<SDValue, 16> Ops;
28498 for (unsigned i = 0; i < InNumElts; ++i)
28499 Ops.push_back(InOp.getOperand(i));
28500
28501 EVT EltVT = InOp.getOperand(0).getValueType();
28502
28503 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
28504 DAG.getUNDEF(EltVT);
28505 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
28506 Ops.push_back(FillVal);
28507 return DAG.getBuildVector(NVT, dl, Ops);
28508 }
28509 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
28510 DAG.getUNDEF(NVT);
28511 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
28512 InOp, DAG.getIntPtrConstant(0, dl));
28513}
28514
28515static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
28516 SelectionDAG &DAG) {
28517 assert(Subtarget.hasAVX512() &&
28518        "MGATHER/MSCATTER are supported on AVX-512 arch only");
28519
28520 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
28521 SDValue Src = N->getValue();
28522 MVT VT = Src.getSimpleValueType();
28523 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
28524 SDLoc dl(Op);
28525
28526 SDValue Scale = N->getScale();
28527 SDValue Index = N->getIndex();
28528 SDValue Mask = N->getMask();
28529 SDValue Chain = N->getChain();
28530 SDValue BasePtr = N->getBasePtr();
28531
28532 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
28533 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
28534 // If the index is v2i64 and we have VLX we can use xmm for data and index.
28535 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
28536 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28537 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
28538 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
28539 SDVTList VTs = DAG.getVTList(MVT::Other);
28540 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
28541 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28542 N->getMemoryVT(), N->getMemOperand());
28543 }
28544 return SDValue();
28545 }
28546
28547 MVT IndexVT = Index.getSimpleValueType();
28548
28549 // If the index is v2i32, we're being called by type legalization and we
28550 // should just let the default handling take care of it.
28551 if (IndexVT == MVT::v2i32)
28552 return SDValue();
28553
28554 // If we don't have VLX and neither the source data nor the index is
28555 // 512 bits, we need to widen until one of them is.
28556 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
28557 !Index.getSimpleValueType().is512BitVector()) {
28558 // Determine how much we need to widen by to get a 512-bit type.
28559 unsigned Factor = std::min(512/VT.getSizeInBits(),
28560 512/IndexVT.getSizeInBits());
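// Worked example (illustrative): for v4i32 data (128 bits) with a v4i64 index
// (256 bits), Factor = min(512/128, 512/256) = 2, so NumElts becomes 8 and the
// op is widened to v8i32 data, a v8i64 (512-bit) index, and a v8i1 mask.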
28561 unsigned NumElts = VT.getVectorNumElements() * Factor;
28562
28563 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
28564 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
28565 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
28566
28567 Src = ExtendToType(Src, VT, DAG);
28568 Index = ExtendToType(Index, IndexVT, DAG);
28569 Mask = ExtendToType(Mask, MaskVT, DAG, true);
28570 }
28571
28572 SDVTList VTs = DAG.getVTList(MVT::Other);
28573 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
28574 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28575 N->getMemoryVT(), N->getMemOperand());
28576}
28577
28578static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
28579 SelectionDAG &DAG) {
28580
28581 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
28582 MVT VT = Op.getSimpleValueType();
28583 MVT ScalarVT = VT.getScalarType();
28584 SDValue Mask = N->getMask();
28585 MVT MaskVT = Mask.getSimpleValueType();
28586 SDValue PassThru = N->getPassThru();
28587 SDLoc dl(Op);
28588
28589 // Handle AVX masked loads which don't support passthru other than 0.
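// Added summary: a masked load with a non-zero passthru is rewritten below as
// a zero-passthru masked load followed by a VSELECT blend of the loaded value
// with the original passthru.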
28590 if (MaskVT.getVectorElementType() != MVT::i1) {
28591 // We also allow undef in the isel pattern.
28592 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
28593 return Op;
28594
28595 SDValue NewLoad = DAG.getMaskedLoad(
28596 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
28597 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
28598 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
28599 N->isExpandingLoad());
28600 // Emit a blend.
28601 SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
28602 PassThru);
28603 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
28604 }
28605
28606 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&(((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isExpandingLoad() || Subtarget.hasAVX512()) && \"Expanding masked load is supported on AVX-512 target only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28607, __PRETTY_FUNCTION__))
28607 "Expanding masked load is supported on AVX-512 target only!")(((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isExpandingLoad() || Subtarget.hasAVX512()) && \"Expanding masked load is supported on AVX-512 target only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28607, __PRETTY_FUNCTION__))
;
28608
28609 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&(((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >=
32) && "Expanding masked load is supported for 32 and 64-bit types only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && \"Expanding masked load is supported for 32 and 64-bit types only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28610, __PRETTY_FUNCTION__))
28610 "Expanding masked load is supported for 32 and 64-bit types only!")(((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >=
32) && "Expanding masked load is supported for 32 and 64-bit types only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && \"Expanding masked load is supported for 32 and 64-bit types only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28610, __PRETTY_FUNCTION__))
;
28611
28612 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
28613        "Cannot lower masked load op.");
28614
28615 assert((ScalarVT.getSizeInBits() >= 32 ||
28616         (Subtarget.hasBWI() &&
28617          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
28618        "Unsupported masked load op.");
28619
28620 // This operation is legal for targets with VLX, but without
28621 // VLX the vector should be widened to 512 bits.
28622 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
28623 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
28624 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
28625
28626 // Mask element has to be i1.
28627 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
28628        "Unexpected mask type");
28629
28630 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
28631
28632 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
28633 SDValue NewLoad = DAG.getMaskedLoad(
28634 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
28635 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
28636 N->getExtensionType(), N->isExpandingLoad());
28637
28638 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
28639 NewLoad.getValue(0),
28640 DAG.getIntPtrConstant(0, dl));
28641 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
28642 return DAG.getMergeValues(RetOps, dl);
28643}
28644
28645static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
28646 SelectionDAG &DAG) {
28647 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
28648 SDValue DataToStore = N->getValue();
28649 MVT VT = DataToStore.getSimpleValueType();
28650 MVT ScalarVT = VT.getScalarType();
28651 SDValue Mask = N->getMask();
28652 SDLoc dl(Op);
28653
28654 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&(((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isCompressingStore() || Subtarget.hasAVX512()) && \"Expanding masked load is supported on AVX-512 target only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28655, __PRETTY_FUNCTION__))
28655 "Expanding masked load is supported on AVX-512 target only!")(((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
"Expanding masked load is supported on AVX-512 target only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isCompressingStore() || Subtarget.hasAVX512()) && \"Expanding masked load is supported on AVX-512 target only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28655, __PRETTY_FUNCTION__))
;
28656
28657 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&(((!N->isCompressingStore() || ScalarVT.getSizeInBits() >=
32) && "Expanding masked load is supported for 32 and 64-bit types only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && \"Expanding masked load is supported for 32 and 64-bit types only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28658, __PRETTY_FUNCTION__))
28658 "Expanding masked load is supported for 32 and 64-bit types only!")(((!N->isCompressingStore() || ScalarVT.getSizeInBits() >=
32) && "Expanding masked load is supported for 32 and 64-bit types only!"
) ? static_cast<void> (0) : __assert_fail ("(!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && \"Expanding masked load is supported for 32 and 64-bit types only!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28658, __PRETTY_FUNCTION__))
;
28659
28660 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
28661        "Cannot lower masked store op.");
28662
28663 assert((ScalarVT.getSizeInBits() >= 32 ||
28664         (Subtarget.hasBWI() &&
28665          (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
28666        "Unsupported masked store op.");
28667
28668 // This operation is legal for targets with VLX, but without
28669 // VLX the vector should be widened to 512 bits.
28670 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
28671 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
28672
28673 // Mask element has to be i1.
28674 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
28675        "Unexpected mask type");
28676
28677 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
28678
28679 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
28680 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
28681 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
28682 N->getOffset(), Mask, N->getMemoryVT(),
28683 N->getMemOperand(), N->getAddressingMode(),
28684 N->isTruncatingStore(), N->isCompressingStore());
28685}
28686
28687static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
28688 SelectionDAG &DAG) {
28689 assert(Subtarget.hasAVX2() &&
28690        "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
28691
28692 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
28693 SDLoc dl(Op);
28694 MVT VT = Op.getSimpleValueType();
28695 SDValue Index = N->getIndex();
28696 SDValue Mask = N->getMask();
28697 SDValue PassThru = N->getPassThru();
28698 MVT IndexVT = Index.getSimpleValueType();
28699
28700 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
28701
28702 // If the index is v2i32, we're being called by type legalization.
28703 if (IndexVT == MVT::v2i32)
28704 return SDValue();
28705
28706 // If we don't have VLX and neither the passthru nor the index is
28707 // 512 bits, we need to widen until one of them is.
28708 MVT OrigVT = VT;
28709 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
28710 !IndexVT.is512BitVector()) {
28711 // Determine how much we need to widen by to get a 512-bit type.
28712 unsigned Factor = std::min(512/VT.getSizeInBits(),
28713 512/IndexVT.getSizeInBits());
28714
28715 unsigned NumElts = VT.getVectorNumElements() * Factor;
28716
28717 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
28718 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
28719 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
28720
28721 PassThru = ExtendToType(PassThru, VT, DAG);
28722 Index = ExtendToType(Index, IndexVT, DAG);
28723 Mask = ExtendToType(Mask, MaskVT, DAG, true);
28724 }
28725
28726 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
28727 N->getScale() };
28728 SDValue NewGather = DAG.getMemIntrinsicNode(
28729 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
28730 N->getMemOperand());
28731 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
28732 NewGather, DAG.getIntPtrConstant(0, dl));
28733 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
28734}
28735
28736static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
28737 SDLoc dl(Op);
28738 SDValue Src = Op.getOperand(0);
28739 MVT DstVT = Op.getSimpleValueType();
28740
28741 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
28742 unsigned SrcAS = N->getSrcAddressSpace();
28743
28744 assert(SrcAS != N->getDestAddressSpace() &&((SrcAS != N->getDestAddressSpace() && "addrspacecast must be between different address spaces"
) ? static_cast<void> (0) : __assert_fail ("SrcAS != N->getDestAddressSpace() && \"addrspacecast must be between different address spaces\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28745, __PRETTY_FUNCTION__))
28745 "addrspacecast must be between different address spaces")((SrcAS != N->getDestAddressSpace() && "addrspacecast must be between different address spaces"
) ? static_cast<void> (0) : __assert_fail ("SrcAS != N->getDestAddressSpace() && \"addrspacecast must be between different address spaces\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28745, __PRETTY_FUNCTION__))
;
28746
28747 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
28748 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
28749 } else if (DstVT == MVT::i64) {
28750 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
28751 } else if (DstVT == MVT::i32) {
28752 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
28753 } else {
28754 report_fatal_error("Bad address space in addrspacecast");
28755 }
28756 return Op;
28757}
28758
28759SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
28760 SelectionDAG &DAG) const {
28761 // TODO: Eventually, the lowering of these nodes should be informed by or
28762 // deferred to the GC strategy for the function in which they appear. For
28763 // now, however, they must be lowered to something. Since they are logically
28764 // no-ops in the case of a null GC strategy (or a GC strategy which does not
28765 // require special handling for these nodes), lower them as literal NOOPs for
28766 // the time being.
28767 SmallVector<SDValue, 2> Ops;
28768
28769 Ops.push_back(Op.getOperand(0));
28770 if (Op->getGluedNode())
28771 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
28772
28773 SDLoc OpDL(Op);
28774 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
28775 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
28776
28777 return NOOP;
28778}
28779
28780SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
28781 RTLIB::Libcall Call) const {
28782
28783 bool IsStrict = Op->isStrictFPOpcode();
28784 unsigned Offset = IsStrict ? 1 : 0;
28785 SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
28786
28787 SDLoc dl(Op);
28788 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
28789 MakeLibCallOptions CallOptions;
28790 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
28791 CallOptions, dl, Chain);
28792
28793 if (IsStrict)
28794 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
28795
28796 return Tmp.first;
28797}
28798
28799// Custom split CVTPS2PH with wide types.
28800static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
28801 SDLoc dl(Op);
28802 EVT VT = Op.getValueType();
28803 SDValue Lo, Hi;
28804 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
28805 EVT LoVT, HiVT;
28806 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
28807 SDValue RC = Op.getOperand(1);
28808 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
28809 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
28810 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28811}
28812
28813/// Provide custom lowering hooks for some operations.
28814SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
28815 switch (Op.getOpcode()) {
28816 default: llvm_unreachable("Should not custom lower this!");
28817 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
28818 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
28819 return LowerCMP_SWAP(Op, Subtarget, DAG);
28820 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
28821 case ISD::ATOMIC_LOAD_ADD:
28822 case ISD::ATOMIC_LOAD_SUB:
28823 case ISD::ATOMIC_LOAD_OR:
28824 case ISD::ATOMIC_LOAD_XOR:
28825 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
28826 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
28827 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
28828 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
28829 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
28830 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
28831 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
28832 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
28833 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
28834 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
28835 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
28836 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
28837 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
28838 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
28839 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
28840 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
28841 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
28842 case ISD::SHL_PARTS:
28843 case ISD::SRA_PARTS:
28844 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
28845 case ISD::FSHL:
28846 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
28847 case ISD::STRICT_SINT_TO_FP:
28848 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
28849 case ISD::STRICT_UINT_TO_FP:
28850 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
28851 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
28852 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
28853 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
28854 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
28855 case ISD::ZERO_EXTEND_VECTOR_INREG:
28856 case ISD::SIGN_EXTEND_VECTOR_INREG:
28857 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
28858 case ISD::FP_TO_SINT:
28859 case ISD::STRICT_FP_TO_SINT:
28860 case ISD::FP_TO_UINT:
28861 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
28862 case ISD::FP_EXTEND:
28863 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
28864 case ISD::FP_ROUND:
28865 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
28866 case ISD::FP16_TO_FP:
28867 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
28868 case ISD::FP_TO_FP16:
28869 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
28870 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
28871 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
28872 case ISD::FADD:
28873 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
28874 case ISD::FROUND: return LowerFROUND(Op, DAG);
28875 case ISD::FABS:
28876 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
28877 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
28878 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
28879 case ISD::LRINT:
28880 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
28881 case ISD::SETCC:
28882 case ISD::STRICT_FSETCC:
28883 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
28884 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
28885 case ISD::SELECT: return LowerSELECT(Op, DAG);
28886 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
28887 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
28888 case ISD::VASTART: return LowerVASTART(Op, DAG);
28889 case ISD::VAARG: return LowerVAARG(Op, DAG);
28890 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
28891 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
28892 case ISD::INTRINSIC_VOID:
28893 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
28894 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
28895 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
28896 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
28897 case ISD::FRAME_TO_ARGS_OFFSET:
28898 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
28899 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
28900 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
28901 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
28902 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
28903 case ISD::EH_SJLJ_SETUP_DISPATCH:
28904 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
28905 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
28906 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
28907 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
28908 case ISD::CTLZ:
28909 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
28910 case ISD::CTTZ:
28911 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
28912 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
28913 case ISD::MULHS:
28914 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
28915 case ISD::ROTL:
28916 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
28917 case ISD::SRA:
28918 case ISD::SRL:
28919 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
28920 case ISD::SADDO:
28921 case ISD::UADDO:
28922 case ISD::SSUBO:
28923 case ISD::USUBO:
28924 case ISD::SMULO:
28925 case ISD::UMULO: return LowerXALUO(Op, DAG);
28926 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
28927 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
28928 case ISD::ADDCARRY:
28929 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
28930 case ISD::ADD:
28931 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
28932 case ISD::UADDSAT:
28933 case ISD::SADDSAT:
28934 case ISD::USUBSAT:
28935 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
28936 case ISD::SMAX:
28937 case ISD::SMIN:
28938 case ISD::UMAX:
28939 case ISD::UMIN: return LowerMINMAX(Op, DAG);
28940 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
28941 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
28942 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
28943 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
28944 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
28945 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
28946 case ISD::GC_TRANSITION_START:
28947 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
28948 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
28949 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
28950 }
28951}
28952
28953/// Places new result values for the node in Results (their number
28954/// and types must exactly match those of the original return values of
28955/// the node), or leaves Results empty, which indicates that the node is not
28956/// to be custom lowered after all.
28957void X86TargetLowering::LowerOperationWrapper(SDNode *N,
28958 SmallVectorImpl<SDValue> &Results,
28959 SelectionDAG &DAG) const {
28960 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
28961
28962 if (!Res.getNode())
28963 return;
28964
28965 // If the original node has one result, take the return value from
28966 // LowerOperation as is. It might not be result number 0.
28967 if (N->getNumValues() == 1) {
28968 Results.push_back(Res);
28969 return;
28970 }
28971
28972 // If the original node has multiple results, then the return node should
28973 // have the same number of results.
28974 assert((N->getNumValues() == Res->getNumValues()) &&(((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!") ? static_cast
<void> (0) : __assert_fail ("(N->getNumValues() == Res->getNumValues()) && \"Lowering returned the wrong number of results!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28975, __PRETTY_FUNCTION__))
28975 "Lowering returned the wrong number of results!")(((N->getNumValues() == Res->getNumValues()) &&
"Lowering returned the wrong number of results!") ? static_cast
<void> (0) : __assert_fail ("(N->getNumValues() == Res->getNumValues()) && \"Lowering returned the wrong number of results!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28975, __PRETTY_FUNCTION__))
;
28976
28977 // Place the new result values based on N's result numbers.
28978 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
28979 Results.push_back(Res.getValue(I));
28980}
28981
28982/// Replace a node with an illegal result type with a new node built out of
28983/// custom code.
28984void X86TargetLowering::ReplaceNodeResults(SDNode *N,
28985 SmallVectorImpl<SDValue>&Results,
28986 SelectionDAG &DAG) const {
28987 SDLoc dl(N);
28988 switch (N->getOpcode()) {
28989 default:
28990#ifndef NDEBUG
28991 dbgs() << "ReplaceNodeResults: ";
28992 N->dump(&DAG);
28993#endif
28994 llvm_unreachable("Do not know how to custom type legalize this operation!")::llvm::llvm_unreachable_internal("Do not know how to custom type legalize this operation!"
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 28994)
;
28995 case X86ISD::CVTPH2PS: {
28996 EVT VT = N->getValueType(0);
28997 SDValue Lo, Hi;
28998 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
28999 EVT LoVT, HiVT;
29000 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
29001 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
29002 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
29003 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29004 Results.push_back(Res);
29005 return;
29006 }
29007 case X86ISD::STRICT_CVTPH2PS: {
29008 EVT VT = N->getValueType(0);
29009 SDValue Lo, Hi;
29010 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
29011 EVT LoVT, HiVT;
29012 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
29013 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
29014 {N->getOperand(0), Lo});
29015 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
29016 {N->getOperand(0), Hi});
29017 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
29018 Lo.getValue(1), Hi.getValue(1));
29019 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29020 Results.push_back(Res);
29021 Results.push_back(Chain);
29022 return;
29023 }
29024 case ISD::CTPOP: {
29025 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!")((N->getValueType(0) == MVT::i64 && "Unexpected VT!"
) ? static_cast<void> (0) : __assert_fail ("N->getValueType(0) == MVT::i64 && \"Unexpected VT!\""
, "/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp"
, 29025, __PRETTY_FUNCTION__))
;
29026 // Use a v2i64 if possible.
29027 bool NoImplicitFloatOps =
29028 DAG.getMachineFunction().getFunction().hasFnAttribute(
29029 Attribute::NoImplicitFloat);
29030 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
29031 SDValue Wide =
29032 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
29033 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
29034 // The bit count should fit in 32 bits; extract it as an i32 and then zero
29035 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
29036 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
29037 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
29038 DAG.getIntPtrConstant(0, dl));
29039 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
29040 Results.push_back(Wide);
29041 }
29042 return;
29043 }
29044 case ISD::MUL: {
29045 EVT VT = N->getValueType(0);
29046 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29047        VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
29048 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
29049 // elements are needed.
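// Worked example (illustrative): a v2i8 multiply is any-extended to v2i16,
// multiplied, truncated back to v2i8, and then concatenated as the first of
// NumConcats = 16/2 = 8 v2i8 pieces (the other seven undef) to form the
// widened v16i8 result expected by type legalization.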
29050 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29051 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
29052 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
29053 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
29054 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
29055 unsigned NumConcats = 16 / VT.getVectorNumElements();
29056 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
29057 ConcatOps[0] = Res;
29058 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
29059 Results.push_back(Res);
29060 return;
29061 }
29062 case X86ISD::VPMADDWD:
29063 case X86ISD::AVG: {
29064 // Legalize types for X86ISD::AVG/VPMADDWD by widening.
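// Worked example (illustrative): a VPMADDWD producing v2i32 from v4i16 inputs
// (64 bits) uses NumConcat = 128/64 = 2, widening the inputs to v8i16 and the
// result to v4i32 so the node operates on full 128-bit vectors.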
29065 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29066
29067 EVT VT = N->getValueType(0);
29068 EVT InVT = N->getOperand(0).getValueType();
29069 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
29070 "Expected a VT that divides into 128 bits.");
29071 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29072 "Unexpected type action!");
29073 unsigned NumConcat = 128 / InVT.getSizeInBits();
29074
29075 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
29076 InVT.getVectorElementType(),
29077 NumConcat * InVT.getVectorNumElements());
29078 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
29079 VT.getVectorElementType(),
29080 NumConcat * VT.getVectorNumElements());
29081
29082 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
29083 Ops[0] = N->getOperand(0);
29084 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
29085 Ops[0] = N->getOperand(1);
29086 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
29087
29088 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
29089 Results.push_back(Res);
29090 return;
29091 }
29092 case ISD::ABS: {
29093 assert(N->getValueType(0) == MVT::i64 &&
29094 "Unexpected type (!= i64) on ABS.");
29095 MVT HalfT = MVT::i32;
29096 SDValue Lo, Hi, Tmp;
29097 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1);
29098
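// The expansion below uses the branchless identity abs(x) = (x + s) ^ s
// with s = x >> 63 (arithmetic shift), applied to 32-bit halves: Tmp is
// the sign mask of the high half, UADDO/ADDCARRY add the mask to Lo:Hi
// with carry propagation, and the XORs with Tmp negate negative inputs
// while leaving non-negative ones unchanged.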
29099 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
29100 DAG.getConstant(0, dl, HalfT));
29101 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0),
29102 DAG.getConstant(1, dl, HalfT));
29103 Tmp = DAG.getNode(
29104 ISD::SRA, dl, HalfT, Hi,
29105 DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
29106 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
29107 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
29108 SDValue(Lo.getNode(), 1));
29109 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
29110 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
29111 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
29112 return;
29113 }
29114 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
29115 case X86ISD::FMINC:
29116 case X86ISD::FMIN:
29117 case X86ISD::FMAXC:
29118 case X86ISD::FMAX: {
29119 EVT VT = N->getValueType(0);
29120 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
29121 SDValue UNDEF = DAG.getUNDEF(VT);
29122 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
29123 N->getOperand(0), UNDEF);
29124 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
29125 N->getOperand(1), UNDEF);
29126 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
29127 return;
29128 }
29129 case ISD::SDIV:
29130 case ISD::UDIV:
29131 case ISD::SREM:
29132 case ISD::UREM: {
29133 EVT VT = N->getValueType(0);
29134 if (VT.isVector()) {
29135 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29136 "Unexpected type action!");
29137 // If the RHS is a constant splat vector we can widen this and let
29138 // the division/remainder-by-constant optimization handle it.
29139 // TODO: Can we do something for non-splat?
29140 APInt SplatVal;
29141 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
29142 unsigned NumConcats = 128 / VT.getSizeInBits();
29143 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
29144 Ops0[0] = N->getOperand(0);
29145 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
29146 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
29147 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
29148 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
29149 Results.push_back(Res);
29150 }
29151 return;
29152 }
29153
29154 LLVM_FALLTHROUGH;
29155 }
29156 case ISD::SDIVREM:
29157 case ISD::UDIVREM: {
29158 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
29159 Results.push_back(V);
29160 return;
29161 }
29162 case ISD::TRUNCATE: {
29163 MVT VT = N->getSimpleValueType(0);
29164 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
29165 return;
29166
29167 // The generic legalizer will try to widen the input type to the same
29168 // number of elements as the widened result type. But this isn't always
29169 // the best thing so do some custom legalization to avoid some cases.
29170 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
29171 SDValue In = N->getOperand(0);
29172 EVT InVT = In.getValueType();
29173
29174 unsigned InBits = InVT.getSizeInBits();
29175 if (128 % InBits == 0) {
29176 // 128 bit and smaller inputs should avoid truncate altogether and
29177 // just use a build_vector that will become a shuffle.
29178 // TODO: Widen and use a shuffle directly?
29179 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
29180 EVT EltVT = VT.getVectorElementType();
29181 unsigned WidenNumElts = WidenVT.getVectorNumElements();
29182 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
29183 // Use the original element count so we don't do more scalar opts than
29184 // necessary.
29185 unsigned MinElts = VT.getVectorNumElements();
29186 for (unsigned i=0; i < MinElts; ++i) {
29187 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
29188 DAG.getIntPtrConstant(i, dl));
29189 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
29190 }
29191 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
29192 return;
29193 }
29194 // With AVX512 there are some cases that can use a target specific
29195 // truncate node to go from 256/512 to less than 128 with zeros in the
29196 // upper elements of the 128 bit result.
29197 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
29198 // We can use VTRUNC directly for 256 bits with VLX, or for any 512-bit input.
29199 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
29200 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
29201 return;
29202 }
29203 // There's one case we can widen to 512 bits and use VTRUNC.
29204 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
29205 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
29206 DAG.getUNDEF(MVT::v4i64));
29207 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
29208 return;
29209 }
29210 }
29211 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
29212 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
29213 isTypeLegal(MVT::v4i64)) {
29214 // Input needs to be split and output needs to be widened. Let's use two
29215 // VTRUNCs, and shuffle their results together into the wider type.
29216 SDValue Lo, Hi;
29217 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
29218
29219 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
29220 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
29221 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
29222 { 0, 1, 2, 3, 16, 17, 18, 19,
29223 -1, -1, -1, -1, -1, -1, -1, -1 });
29224 Results.push_back(Res);
29225 return;
29226 }
29227
29228 return;
29229 }
29230 case ISD::ANY_EXTEND:
29231 // Right now, only MVT::v8i8 has Custom action for an illegal type.
29232 // It's intended to custom handle the input type.
29233 assert(N->getValueType(0) == MVT::v8i8 &&
29234 "Do not know how to legalize this Node");
29235 return;
29236 case ISD::SIGN_EXTEND:
29237 case ISD::ZERO_EXTEND: {
29238 EVT VT = N->getValueType(0);
29239 SDValue In = N->getOperand(0);
29240 EVT InVT = In.getValueType();
29241 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
29242 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
29243 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
29244 "Unexpected type action!");
29245 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
29246 // Custom split this so we can extend i8/i16->i32 invec. This is better
29247 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
29248 // sra, followed by an extend from i32 to i64 using pcmpgt. By custom
29249 // splitting we allow the sra from the extend to i32 to be shared by the split.
29250 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
29251
29252 // Fill a vector with sign bits for each element.
29253 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
29254 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
29255
29256 // Create an unpackl and unpackh to interleave the sign bits then bitcast
29257 // to v2i64.
29258 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
29259 {0, 4, 1, 5});
29260 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
29261 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
29262 {2, 6, 3, 7});
29263 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
29264
29265 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29266 Results.push_back(Res);
29267 return;
29268 }
29269
29270 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
29271 if (!InVT.is128BitVector()) {
29272 // Not a 128 bit vector, but maybe type legalization will promote
29273 // it to 128 bits.
29274 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
29275 return;
29276 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
29277 if (!InVT.is128BitVector())
29278 return;
29279
29280 // Promote the input to 128 bits. Type legalization will turn this into
29281 // zext_inreg/sext_inreg.
29282 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
29283 }
29284
29285 // Perform custom splitting instead of the two stage extend we would get
29286 // by default.
29287 EVT LoVT, HiVT;
29288 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
29289 assert(isTypeLegal(LoVT) && "Split VT not legal?");
29290
29291 SDValue Lo = getExtendInVec(N->getOpcode(), dl, LoVT, In, DAG);
29292
29293 // We need to shift the input over by half the number of elements.
29294 unsigned NumElts = InVT.getVectorNumElements();
29295 unsigned HalfNumElts = NumElts / 2;
29296 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
29297 for (unsigned i = 0; i != HalfNumElts; ++i)
29298 ShufMask[i] = i + HalfNumElts;
29299
29300 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
29301 Hi = getExtendInVec(N->getOpcode(), dl, HiVT, Hi, DAG);
29302
29303 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29304 Results.push_back(Res);
29305 }
29306 return;
29307 }
29308 case ISD::FP_TO_SINT:
29309 case ISD::STRICT_FP_TO_SINT:
29310 case ISD::FP_TO_UINT:
29311 case ISD::STRICT_FP_TO_UINT: {
29312 bool IsStrict = N->isStrictFPOpcode();
29313 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
29314 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
29315 EVT VT = N->getValueType(0);
29316 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
29317 EVT SrcVT = Src.getValueType();
29318
29319 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
29320 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29321 "Unexpected type action!");
29322
29323 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
29324 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
29325 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
29326 VT.getVectorNumElements());
29327 SDValue Res;
29328 SDValue Chain;
29329 if (IsStrict) {
29330 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
29331 {N->getOperand(0), Src});
29332 Chain = Res.getValue(1);
29333 } else
29334 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
29335
29336 // Preserve what we know about the size of the original result. Except
29337 // when the result is v2i32 since we can't widen the assert.
29338 if (PromoteVT != MVT::v2i32)
29339 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
29340 dl, PromoteVT, Res,
29341 DAG.getValueType(VT.getVectorElementType()));
29342
29343 // Truncate back to the original width.
29344 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
29345
29346 // Now widen to 128 bits.
29347 unsigned NumConcats = 128 / VT.getSizeInBits();
29348 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
29349 VT.getVectorNumElements() * NumConcats);
29350 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
29351 ConcatOps[0] = Res;
29352 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
29353 Results.push_back(Res);
29354 if (IsStrict)
29355 Results.push_back(Chain);
29356 return;
29357 }
29358
29359
29360 if (VT == MVT::v2i32) {
29361 assert((IsSigned || Subtarget.hasAVX512()) &&
29362 "Can only handle signed conversion without AVX512");
29363 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29364 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29365 "Unexpected type action!");
29366 if (Src.getValueType() == MVT::v2f64) {
29367 unsigned Opc;
29368 if (IsStrict)
29369 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
29370 else
29371 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
29372
29373 // If we have VLX we can emit a target specific FP_TO_UINT node.
29374 if (!IsSigned && !Subtarget.hasVLX()) {
29375 // Otherwise we can defer to the generic legalizer which will widen
29376 // the input as well. This will be further widened during op
29377 // legalization to v8i32<-v8f64.
29378 // For strict nodes we'll need to widen ourselves.
29379 // FIXME: Fix the type legalizer to safely widen strict nodes?
29380 if (!IsStrict)
29381 return;
29382 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
29383 DAG.getConstantFP(0.0, dl, MVT::v2f64));
29384 Opc = N->getOpcode();
29385 }
29386 SDValue Res;
29387 SDValue Chain;
29388 if (IsStrict) {
29389 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
29390 {N->getOperand(0), Src});
29391 Chain = Res.getValue(1);
29392 } else {
29393 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
29394 }
29395 Results.push_back(Res);
29396 if (IsStrict)
29397 Results.push_back(Chain);
29398 return;
29399 }
29400
29401 // Custom widen strict v2f32->v2i32 by padding with zeros.
29402 // FIXME: Should generic type legalizer do this?
29403 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
29404 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
29405 DAG.getConstantFP(0.0, dl, MVT::v2f32));
29406 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
29407 {N->getOperand(0), Src});
29408 Results.push_back(Res);
29409 Results.push_back(Res.getValue(1));
29410 return;
29411 }
29412
29413 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
29414 // so early out here.
29415 return;
29416 }
29417
29418 assert(!VT.isVector() && "Vectors should have been handled above!");
29419
29420 if (Subtarget.hasDQI() && VT == MVT::i64 &&
29421 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
29422 assert(!Subtarget.is64Bit() && "i64 should be legal");
29423 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
29424 // If we use a 128-bit result we might need to use a target specific node.
29425 unsigned SrcElts =
29426 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
29427 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
29428 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
29429 unsigned Opc = N->getOpcode();
29430 if (NumElts != SrcElts) {
29431 if (IsStrict)
29432 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
29433 else
29434 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
29435 }
29436
29437 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
29438 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
29439 DAG.getConstantFP(0.0, dl, VecInVT), Src,
29440 ZeroIdx);
29441 SDValue Chain;
29442 if (IsStrict) {
29443 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
29444 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
29445 Chain = Res.getValue(1);
29446 } else
29447 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
29448 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
29449 Results.push_back(Res);
29450 if (IsStrict)
29451 Results.push_back(Chain);
29452 return;
29453 }
29454
29455 SDValue Chain;
29456 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
29457 Results.push_back(V);
29458 if (IsStrict)
29459 Results.push_back(Chain);
29460 }
29461 return;
29462 }
29463 case ISD::LRINT:
29464 case ISD::LLRINT: {
29465 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
29466 Results.push_back(V);
29467 return;
29468 }
29469
29470 case ISD::SINT_TO_FP:
29471 case ISD::STRICT_SINT_TO_FP:
29472 case ISD::UINT_TO_FP:
29473 case ISD::STRICT_UINT_TO_FP: {
29474 bool IsStrict = N->isStrictFPOpcode();
29475 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
29476 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
29477 EVT VT = N->getValueType(0);
29478 if (VT != MVT::v2f32)
29479 return;
29480 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
29481 EVT SrcVT = Src.getValueType();
29482 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
29483 if (IsStrict) {
29484 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
29485 : X86ISD::STRICT_CVTUI2P;
29486 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
29487 {N->getOperand(0), Src});
29488 Results.push_back(Res);
29489 Results.push_back(Res.getValue(1));
29490 } else {
29491 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
29492 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
29493 }
29494 return;
29495 }
29496 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
29497 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
29498 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
29499 SDValue One = DAG.getConstant(1, dl, SrcVT);
29500 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
29501 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
29502 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
29503 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
29504 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
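// Lanes with the sign bit set (values >= 2^63) cannot use a signed
// conversion directly, so for those lanes Sign = (Src >> 1) | (Src & 1)
// halves the value while folding the low bit back in so the final rounding
// is unaffected; the FADD below doubles the converted result, and the
// IsNeg select picks between the plain and the halved-then-doubled paths.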
29505 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
29506 for (int i = 0; i != 2; ++i) {
29507 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
29508 SignSrc, DAG.getIntPtrConstant(i, dl));
29509 if (IsStrict)
29510 SignCvts[i] =
29511 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
29512 {N->getOperand(0), Elt});
29513 else
29514 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
29515 };
29516 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
29517 SDValue Slow, Chain;
29518 if (IsStrict) {
29519 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
29520 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
29521 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
29522 {Chain, SignCvt, SignCvt});
29523 Chain = Slow.getValue(1);
29524 } else {
29525 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
29526 }
29527 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
29528 IsNeg =
29529 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
29530 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
29531 Results.push_back(Cvt);
29532 if (IsStrict)
29533 Results.push_back(Chain);
29534 return;
29535 }
29536
29537 if (SrcVT != MVT::v2i32)
29538 return;
29539
29540 if (IsSigned || Subtarget.hasAVX512()) {
29541 if (!IsStrict)
29542 return;
29543
29544 // Custom widen strict v2i32->v2f32 to avoid scalarization.
29545 // FIXME: Should generic type legalizer do this?
29546 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
29547 DAG.getConstant(0, dl, MVT::v2i32));
29548 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
29549 {N->getOperand(0), Src});
29550 Results.push_back(Res);
29551 Results.push_back(Res.getValue(1));
29552 return;
29553 }
29554
29555 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29556 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
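// 0x4330000000000000 is the double constant 2^52. OR-ing a zero-extended
// 32-bit value into its mantissa produces the double 2^52 + x exactly, so
// the FSUB of 2^52 below recovers x as an f64 without an int->fp
// instruction; VFPROUND then narrows the result to f32.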
29557 SDValue VBias =
29558 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
29559 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
29560 DAG.getBitcast(MVT::v2i64, VBias));
29561 Or = DAG.getBitcast(MVT::v2f64, Or);
29562 if (IsStrict) {
29563 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
29564 {N->getOperand(0), Or, VBias});
29565 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
29566 {MVT::v4f32, MVT::Other},
29567 {Sub.getValue(1), Sub});
29568 Results.push_back(Res);
29569 Results.push_back(Res.getValue(1));
29570 } else {
29571 // TODO: Are there any fast-math-flags to propagate here?
29572 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
29573 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
29574 }
29575 return;
29576 }
29577 case ISD::STRICT_FP_ROUND:
29578 case ISD::FP_ROUND: {
29579 bool IsStrict = N->isStrictFPOpcode();
29580 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
29581 if (!isTypeLegal(Src.getValueType()))
29582 return;
29583 SDValue V;
29584 if (IsStrict)
29585 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
29586 {N->getOperand(0), N->getOperand(1)});
29587 else
29588 V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
29589 Results.push_back(V);
29590 if (IsStrict)
29591 Results.push_back(V.getValue(1));
29592 return;
29593 }
29594 case ISD::FP_EXTEND:
29595 case ISD::STRICT_FP_EXTEND: {
29596 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
29597 // No other ValueType for FP_EXTEND should reach this point.
29598 assert(N->getValueType(0) == MVT::v2f32 &&
29599 "Do not know how to legalize this Node");
29600 return;
29601 }
29602 case ISD::INTRINSIC_W_CHAIN: {
29603 unsigned IntNo = N->getConstantOperandVal(1);
29604 switch (IntNo) {
29605 default : llvm_unreachable("Do not know how to custom type "
29606 "legalize this intrinsic operation!");
29607 case Intrinsic::x86_rdtsc:
29608 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
29609 Results);
29610 case Intrinsic::x86_rdtscp:
29611 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
29612 Results);
29613 case Intrinsic::x86_rdpmc:
29614 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
29615 Results);
29616 return;
29617 case Intrinsic::x86_xgetbv:
29618 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
29619 Results);
29620 return;
29621 }
29622 }
29623 case ISD::READCYCLECOUNTER: {
29624 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
29625 }
29626 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
29627 EVT T = N->getValueType(0);
29628 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
29629 bool Regs64bit = T == MVT::i128;
29630 assert((!Regs64bit || Subtarget.hasCmpxchg16b()) &&
29631 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
29632 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
29633 SDValue cpInL, cpInH;
29634 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
29635 DAG.getConstant(0, dl, HalfT));
29636 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
29637 DAG.getConstant(1, dl, HalfT));
29638 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
29639 Regs64bit ? X86::RAX : X86::EAX,
29640 cpInL, SDValue());
29641 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
29642 Regs64bit ? X86::RDX : X86::EDX,
29643 cpInH, cpInL.getValue(1));
29644 SDValue swapInL, swapInH;
29645 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
29646 DAG.getConstant(0, dl, HalfT));
29647 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
29648 DAG.getConstant(1, dl, HalfT));
29649 swapInH =
29650 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
29651 swapInH, cpInH.getValue(1));
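// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and
// the replacement value in ECX:EBX (RCX:RBX), and report success in ZF;
// hence the explicit register copies above and the COND_E test on EFLAGS
// below.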
29652 // If the current function needs the base pointer, RBX,
29653 // we shouldn't use cmpxchg directly.
29654 // The lowering of that instruction will clobber
29655 // that register, and since RBX will be a reserved register
29656 // the register allocator will not make sure its value is
29657 // properly saved and restored around this live-range.
29658 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
29659 SDValue Result;
29660 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
29661 Register BasePtr = TRI->getBaseRegister();
29662 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
29663 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
29664 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
29665 // ISel prefers the LCMPXCHG64 variant.
29666 // If the assert below fires, that is no longer the case,
29667 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
29668 // not just EBX. This is a matter of accepting i64 input for that
29669 // pseudo, and restoring into the register of the right width
29670 // in the expand pseudo. Everything else should just work.
29671 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
29672 "Saving only half of the RBX");
29673 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
29674 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
29675 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
29676 Regs64bit ? X86::RBX : X86::EBX,
29677 HalfT, swapInH.getValue(1));
29678 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
29679 RBXSave,
29680 /*Glue*/ RBXSave.getValue(2)};
29681 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
29682 } else {
29683 unsigned Opcode =
29684 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
29685 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
29686 Regs64bit ? X86::RBX : X86::EBX, swapInL,
29687 swapInH.getValue(1));
29688 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
29689 swapInL.getValue(1)};
29690 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
29691 }
29692 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
29693 Regs64bit ? X86::RAX : X86::EAX,
29694 HalfT, Result.getValue(1));
29695 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
29696 Regs64bit ? X86::RDX : X86::EDX,
29697 HalfT, cpOutL.getValue(2));
29698 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
29699
29700 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
29701 MVT::i32, cpOutH.getValue(2));
29702 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
29703 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
29704
29705 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
29706 Results.push_back(Success);
29707 Results.push_back(EFLAGS.getValue(1));
29708 return;
29709 }
29710 case ISD::ATOMIC_LOAD: {
29711 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
29712 bool NoImplicitFloatOps =
29713 DAG.getMachineFunction().getFunction().hasFnAttribute(
29714 Attribute::NoImplicitFloat);
29715 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
29716 auto *Node = cast<AtomicSDNode>(N);
29717 if (Subtarget.hasSSE1()) {
29718 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
29719 // Then extract the lower 64-bits.
29720 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
29721 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
29722 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
29723 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
29724 MVT::i64, Node->getMemOperand());
29725 if (Subtarget.hasSSE2()) {
29726 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
29727 DAG.getIntPtrConstant(0, dl));
29728 Results.push_back(Res);
29729 Results.push_back(Ld.getValue(1));
29730 return;
29731 }
29732 // We use an alternative sequence for SSE1 that extracts as v2f32 and
29733 // then casts to i64. This avoids a 128-bit stack temporary being
29734 // created by type legalization if we were to cast v4f32->v2i64.
29735 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
29736 DAG.getIntPtrConstant(0, dl));
29737 Res = DAG.getBitcast(MVT::i64, Res);
29738 Results.push_back(Res);
29739 Results.push_back(Ld.getValue(1));
29740 return;
29741 }
29742 if (Subtarget.hasX87()) {
29743 // First load this into an 80-bit X87 register. This will put the whole
29744 // integer into the significand.
29745 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
29746 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
29747 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
29748 dl, Tys, Ops, MVT::i64,
29749 Node->getMemOperand());
29750 SDValue Chain = Result.getValue(1);
29751
29752 // Now store the X87 register to a stack temporary and convert to i64.
29753 // This store is not atomic and doesn't need to be.
29754 // FIXME: We don't need a stack temporary if the result of the load
29755 // is already being stored. We could just directly store there.
29756 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
29757 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29758 MachinePointerInfo MPI =
29759 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29760 SDValue StoreOps[] = { Chain, Result, StackPtr };
29761 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
29762 DAG.getVTList(MVT::Other), StoreOps,
29763 MVT::i64, MPI, 0 /*Align*/,
29764 MachineMemOperand::MOStore);
29765
29766 // Finally load the value back from the stack temporary and return it.
29767 // This load is not atomic and doesn't need to be.
29768 // This load will be further type legalized.
29769 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
29770 Results.push_back(Result);
29771 Results.push_back(Result.getValue(1));
29772 return;
29773 }
29774 }
29775 // TODO: Use MOVLPS when SSE1 is available?
29776 // Delegate to generic TypeLegalization. Situations we can really handle
29777 // should have already been dealt with by AtomicExpandPass.cpp.
29778 break;
29779 }
29780 case ISD::ATOMIC_SWAP:
29781 case ISD::ATOMIC_LOAD_ADD:
29782 case ISD::ATOMIC_LOAD_SUB:
29783 case ISD::ATOMIC_LOAD_AND:
29784 case ISD::ATOMIC_LOAD_OR:
29785 case ISD::ATOMIC_LOAD_XOR:
29786 case ISD::ATOMIC_LOAD_NAND:
29787 case ISD::ATOMIC_LOAD_MIN:
29788 case ISD::ATOMIC_LOAD_MAX:
29789 case ISD::ATOMIC_LOAD_UMIN:
29790 case ISD::ATOMIC_LOAD_UMAX:
29791 // Delegate to generic TypeLegalization. Situations we can really handle
29792 // should have already been dealt with by AtomicExpandPass.cpp.
29793 break;
29794
29795 case ISD::BITCAST: {
29796 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
29797 EVT DstVT = N->getValueType(0);
29798 EVT SrcVT = N->getOperand(0).getValueType();
29799
29800 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
29801 // we can split using the k-register rather than memory.
29802 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
29803 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
29804 SDValue Lo, Hi;
29805 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29806 Lo = DAG.getBitcast(MVT::i32, Lo);
29807 Hi = DAG.getBitcast(MVT::i32, Hi);
29808 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
29809 Results.push_back(Res);
29810 return;
29811 }
29812
29813 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
29814 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
29815 SrcVT.isVector() && isTypeLegal(SrcVT)) {
29816 SDValue Lo, Hi;
29817 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
29818 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
29819 Lo = DAG.getBitcast(CastVT, Lo);
29820 Hi = DAG.getBitcast(CastVT, Hi);
29821 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
29822 Results.push_back(Res);
29823 return;
29824 }
29825
29826 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
29827 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
29828 "Unexpected type action!");
29829 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
29830 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
29831 Results.push_back(Res);
29832 return;
29833 }
29834
29835 return;
29836 }
29837 case ISD::MGATHER: {
29838 EVT VT = N->getValueType(0);
29839 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
29840 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
29841 auto *Gather = cast<MaskedGatherSDNode>(N);
29842 SDValue Index = Gather->getIndex();
29843 if (Index.getValueType() != MVT::v2i64)
29844 return;
29845 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29846 "Unexpected type action!");
29847 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
29848 SDValue Mask = Gather->getMask();
29849 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
29850 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
29851 Gather->getPassThru(),
29852 DAG.getUNDEF(VT));
29853 if (!Subtarget.hasVLX()) {
29854 // We need to widen the mask, but the instruction will only use 2
29855 // of its elements. So we can use undef.
29856 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
29857 DAG.getUNDEF(MVT::v2i1));
29858 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
29859 }
29860 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
29861 Gather->getBasePtr(), Index, Gather->getScale() };
29862 SDValue Res = DAG.getMemIntrinsicNode(
29863 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
29864 Gather->getMemoryVT(), Gather->getMemOperand());
29865 Results.push_back(Res);
29866 Results.push_back(Res.getValue(1));
29867 return;
29868 }
29869 return;
29870 }
29871 case ISD::LOAD: {
29872 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
29873 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
29874 // cast since type legalization will try to use an i64 load.
29875 MVT VT = N->getSimpleValueType(0);
29876 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
29877 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
29878 "Unexpected type action!");
29879 if (!ISD::isNON_EXTLoad(N))
29880 return;
29881 auto *Ld = cast<LoadSDNode>(N);
29882 if (Subtarget.hasSSE2()) {
29883 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
29884 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
29885 Ld->getPointerInfo(), Ld->getAlignment(),
29886 Ld->getMemOperand()->getFlags());
29887 SDValue Chain = Res.getValue(1);
29888 MVT VecVT = MVT::getVectorVT(LdVT, 2);
29889 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
29890 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
29891 Res = DAG.getBitcast(WideVT, Res);
29892 Results.push_back(Res);
29893 Results.push_back(Chain);
29894 return;
29895 }
29896 assert(Subtarget.hasSSE1() && "Expected SSE");
29897 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
29898 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
29899 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
29900 MVT::i64, Ld->getMemOperand());
29901 Results.push_back(Res);
29902 Results.push_back(Res.getValue(1));
29903 return;
29904 }
29905 case ISD::ADDRSPACECAST: {
29906 SDValue Src = N->getOperand(0);
29907 EVT DstVT = N->getValueType(0);
29908 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
29909 unsigned SrcAS = CastN->getSrcAddressSpace();
29910
29911 assert(SrcAS != CastN->getDestAddressSpace() &&
29912 "addrspacecast must be between different address spaces");
29913
29914 SDValue Res;
29915 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
29916 Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
29917 else if (DstVT == MVT::i64)
29918 Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
29919 else if (DstVT == MVT::i32)
29920 Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
29921 else
29922 report_fatal_error("Unrecognized addrspacecast type legalization");
29923
29924 Results.push_back(Res);
29925 return;
29926 }
29927 }
29928}
29929
29930const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
29931 switch ((X86ISD::NodeType)Opcode) {
29932 case X86ISD::FIRST_NUMBER: break;
29933#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
29934 NODE_NAME_CASE(BSF)
29935 NODE_NAME_CASE(BSR)
29936 NODE_NAME_CASE(SHLD)
29937 NODE_NAME_CASE(SHRD)
29938 NODE_NAME_CASE(FAND)
29939 NODE_NAME_CASE(FANDN)
29940 NODE_NAME_CASE(FOR)
29941 NODE_NAME_CASE(FXOR)
29942 NODE_NAME_CASE(FILD)
29943 NODE_NAME_CASE(FIST)
29944 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
29945 NODE_NAME_CASE(FLD)
29946 NODE_NAME_CASE(FST)
29947 NODE_NAME_CASE(CALL)
29948 NODE_NAME_CASE(BT)
29949 NODE_NAME_CASE(CMP)
29950 NODE_NAME_CASE(FCMP)
29951 NODE_NAME_CASE(STRICT_FCMP)
29952 NODE_NAME_CASE(STRICT_FCMPS)
29953 NODE_NAME_CASE(COMI)
29954 NODE_NAME_CASE(UCOMI)
29955 NODE_NAME_CASE(CMPM)
29956 NODE_NAME_CASE(STRICT_CMPM)
29957 NODE_NAME_CASE(CMPM_SAE)
29958 NODE_NAME_CASE(SETCC)
29959 NODE_NAME_CASE(SETCC_CARRY)
29960 NODE_NAME_CASE(FSETCC)
29961 NODE_NAME_CASE(FSETCCM)
29962 NODE_NAME_CASE(FSETCCM_SAE)
29963 NODE_NAME_CASE(CMOV)
29964 NODE_NAME_CASE(BRCOND)
29965 NODE_NAME_CASE(RET_FLAG)
29966 NODE_NAME_CASE(IRET)
29967 NODE_NAME_CASE(REP_STOS)
29968 NODE_NAME_CASE(REP_MOVS)
29969 NODE_NAME_CASE(GlobalBaseReg)
29970 NODE_NAME_CASE(Wrapper)
29971 NODE_NAME_CASE(WrapperRIP)
29972 NODE_NAME_CASE(MOVQ2DQ)
29973 NODE_NAME_CASE(MOVDQ2Q)
29974 NODE_NAME_CASE(MMX_MOVD2W)
29975 NODE_NAME_CASE(MMX_MOVW2D)
29976 NODE_NAME_CASE(PEXTRB)
29977 NODE_NAME_CASE(PEXTRW)
29978 NODE_NAME_CASE(INSERTPS)
29979 NODE_NAME_CASE(PINSRB)
29980 NODE_NAME_CASE(PINSRW)
29981 NODE_NAME_CASE(PSHUFB)
29982 NODE_NAME_CASE(ANDNP)
29983 NODE_NAME_CASE(BLENDI)
29984 NODE_NAME_CASE(BLENDV)
29985 NODE_NAME_CASE(HADD)
29986 NODE_NAME_CASE(HSUB)
29987 NODE_NAME_CASE(FHADD)
29988 NODE_NAME_CASE(FHSUB)
29989 NODE_NAME_CASE(CONFLICT)
29990 NODE_NAME_CASE(FMAX)
29991 NODE_NAME_CASE(FMAXS)
29992 NODE_NAME_CASE(FMAX_SAE)
29993 NODE_NAME_CASE(FMAXS_SAE)
29994 NODE_NAME_CASE(FMIN)
29995 NODE_NAME_CASE(FMINS)
29996 NODE_NAME_CASE(FMIN_SAE)
29997 NODE_NAME_CASE(FMINS_SAE)
29998 NODE_NAME_CASE(FMAXC)
29999 NODE_NAME_CASE(FMINC)
30000 NODE_NAME_CASE(FRSQRT)
30001 NODE_NAME_CASE(FRCP)
30002 NODE_NAME_CASE(EXTRQI)
30003 NODE_NAME_CASE(INSERTQI)
30004 NODE_NAME_CASE(TLSADDR)
30005 NODE_NAME_CASE(TLSBASEADDR)
30006 NODE_NAME_CASE(TLSCALL)
30007 NODE_NAME_CASE(EH_SJLJ_SETJMP)
30008 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
30009 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
30010 NODE_NAME_CASE(EH_RETURN)
30011 NODE_NAME_CASE(TC_RETURN)
30012 NODE_NAME_CASE(FNSTCW16m)
30013 NODE_NAME_CASE(LCMPXCHG_DAG)
30014 NODE_NAME_CASE(LCMPXCHG8_DAG)
30015 NODE_NAME_CASE(LCMPXCHG16_DAG)
30016 NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
30017 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
30018 NODE_NAME_CASE(LADD)
30019 NODE_NAME_CASE(LSUB)
30020 NODE_NAME_CASE(LOR)
30021 NODE_NAME_CASE(LXOR)
30022 NODE_NAME_CASE(LAND)
30023 NODE_NAME_CASE(VZEXT_MOVL)
30024 NODE_NAME_CASE(VZEXT_LOAD)
30025 NODE_NAME_CASE(VEXTRACT_STORE)
30026 NODE_NAME_CASE(VTRUNC)
30027 NODE_NAME_CASE(VTRUNCS)
30028 NODE_NAME_CASE(VTRUNCUS)
30029 NODE_NAME_CASE(VMTRUNC)
30030 NODE_NAME_CASE(VMTRUNCS)
30031 NODE_NAME_CASE(VMTRUNCUS)
30032 NODE_NAME_CASE(VTRUNCSTORES)
30033 NODE_NAME_CASE(VTRUNCSTOREUS)
30034 NODE_NAME_CASE(VMTRUNCSTORES)
30035 NODE_NAME_CASE(VMTRUNCSTOREUS)
30036 NODE_NAME_CASE(VFPEXT)
30037 NODE_NAME_CASE(STRICT_VFPEXT)
30038 NODE_NAME_CASE(VFPEXT_SAE)
30039 NODE_NAME_CASE(VFPEXTS)
30040 NODE_NAME_CASE(VFPEXTS_SAE)
30041 NODE_NAME_CASE(VFPROUND)
30042 NODE_NAME_CASE(STRICT_VFPROUND)
30043 NODE_NAME_CASE(VMFPROUND)
30044 NODE_NAME_CASE(VFPROUND_RND)
30045 NODE_NAME_CASE(VFPROUNDS)
30046 NODE_NAME_CASE(VFPROUNDS_RND)
30047 NODE_NAME_CASE(VSHLDQ)
30048 NODE_NAME_CASE(VSRLDQ)
30049 NODE_NAME_CASE(VSHL)
30050 NODE_NAME_CASE(VSRL)
30051 NODE_NAME_CASE(VSRA)
30052 NODE_NAME_CASE(VSHLI)
30053 NODE_NAME_CASE(VSRLI)
30054 NODE_NAME_CASE(VSRAI)
30055 NODE_NAME_CASE(VSHLV)
30056 NODE_NAME_CASE(VSRLV)
30057 NODE_NAME_CASE(VSRAV)
30058 NODE_NAME_CASE(VROTLI)
30059 NODE_NAME_CASE(VROTRI)
30060 NODE_NAME_CASE(VPPERM)
30061 NODE_NAME_CASE(CMPP)
30062 NODE_NAME_CASE(STRICT_CMPP)
30063 NODE_NAME_CASE(PCMPEQ)
30064 NODE_NAME_CASE(PCMPGT)
30065 NODE_NAME_CASE(PHMINPOS)
30066 NODE_NAME_CASE(ADD)
30067 NODE_NAME_CASE(SUB)
30068 NODE_NAME_CASE(ADC)
30069 NODE_NAME_CASE(SBB)
30070 NODE_NAME_CASE(SMUL)
30071 NODE_NAME_CASE(UMUL)
30072 NODE_NAME_CASE(OR)
30073 NODE_NAME_CASE(XOR)
30074 NODE_NAME_CASE(AND)
30075 NODE_NAME_CASE(BEXTR)
30076 NODE_NAME_CASE(BZHI)
30077 NODE_NAME_CASE(MUL_IMM)
30078 NODE_NAME_CASE(MOVMSK)
30079 NODE_NAME_CASE(PTEST)
30080 NODE_NAME_CASE(TESTP)
30081 NODE_NAME_CASE(KORTEST)
30082 NODE_NAME_CASE(KTEST)
30083 NODE_NAME_CASE(KADD)
30084 NODE_NAME_CASE(KSHIFTL)
30085 NODE_NAME_CASE(KSHIFTR)
30086 NODE_NAME_CASE(PACKSS)
30087 NODE_NAME_CASE(PACKUS)
30088 NODE_NAME_CASE(PALIGNR)
30089 NODE_NAME_CASE(VALIGN)
30090 NODE_NAME_CASE(VSHLD)
30091 NODE_NAME_CASE(VSHRD)
30092 NODE_NAME_CASE(VSHLDV)
30093 NODE_NAME_CASE(VSHRDV)
30094 NODE_NAME_CASE(PSHUFD)
30095 NODE_NAME_CASE(PSHUFHW)
30096 NODE_NAME_CASE(PSHUFLW)
30097 NODE_NAME_CASE(SHUFP)
30098 NODE_NAME_CASE(SHUF128)
30099 NODE_NAME_CASE(MOVLHPS)
30100 NODE_NAME_CASE(MOVHLPS)
30101 NODE_NAME_CASE(MOVDDUP)
30102 NODE_NAME_CASE(MOVSHDUP)
30103 NODE_NAME_CASE(MOVSLDUP)
30104 NODE_NAME_CASE(MOVSD)
30105 NODE_NAME_CASE(MOVSS)
30106 NODE_NAME_CASE(UNPCKL)
30107 NODE_NAME_CASE(UNPCKH)
30108 NODE_NAME_CASE(VBROADCAST)
30109 NODE_NAME_CASE(VBROADCAST_LOAD)
30110 NODE_NAME_CASE(VBROADCASTM)
30111 NODE_NAME_CASE(SUBV_BROADCAST)
30112 NODE_NAME_CASE(VPERMILPV)
30113 NODE_NAME_CASE(VPERMILPI)
30114 NODE_NAME_CASE(VPERM2X128)
30115 NODE_NAME_CASE(VPERMV)
30116 NODE_NAME_CASE(VPERMV3)
30117 NODE_NAME_CASE(VPERMI)
30118 NODE_NAME_CASE(VPTERNLOG)
30119 NODE_NAME_CASE(VFIXUPIMM)
30120 NODE_NAME_CASE(VFIXUPIMM_SAE)
30121 NODE_NAME_CASE(VFIXUPIMMS)
30122 NODE_NAME_CASE(VFIXUPIMMS_SAE)
30123 NODE_NAME_CASE(VRANGE)
30124 NODE_NAME_CASE(VRANGE_SAE)
30125 NODE_NAME_CASE(VRANGES)
30126 NODE_NAME_CASE(VRANGES_SAE)
30127 NODE_NAME_CASE(PMULUDQ)
30128 NODE_NAME_CASE(PMULDQ)
30129 NODE_NAME_CASE(PSADBW)
30130 NODE_NAME_CASE(DBPSADBW)
30131 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
30132 NODE_NAME_CASE(VAARG_64)
30133 NODE_NAME_CASE(WIN_ALLOCA)
30134 NODE_NAME_CASE(MEMBARRIER)
30135 NODE_NAME_CASE(MFENCE)
30136 NODE_NAME_CASE(SEG_ALLOCA)
30137 NODE_NAME_CASE(PROBED_ALLOCA)
30138 NODE_NAME_CASE(RDRAND)
30139 NODE_NAME_CASE(RDSEED)
30140 NODE_NAME_CASE(RDPKRU)
30141 NODE_NAME_CASE(WRPKRU)
30142 NODE_NAME_CASE(VPMADDUBSW)
30143 NODE_NAME_CASE(VPMADDWD)
30144 NODE_NAME_CASE(VPSHA)
30145 NODE_NAME_CASE(VPSHL)
30146 NODE_NAME_CASE(VPCOM)
30147 NODE_NAME_CASE(VPCOMU)
30148 NODE_NAME_CASE(VPERMIL2)
30149 NODE_NAME_CASE(FMSUB)
30150 NODE_NAME_CASE(STRICT_FMSUB)
30151 NODE_NAME_CASE(FNMADD)
30152 NODE_NAME_CASE(STRICT_FNMADD)
30153 NODE_NAME_CASE(FNMSUB)
30154 NODE_NAME_CASE(STRICT_FNMSUB)
30155 NODE_NAME_CASE(FMADDSUB)
30156 NODE_NAME_CASE(FMSUBADD)
30157 NODE_NAME_CASE(FMADD_RND)
30158 NODE_NAME_CASE(FNMADD_RND)
30159 NODE_NAME_CASE(FMSUB_RND)
30160 NODE_NAME_CASE(FNMSUB_RND)
30161 NODE_NAME_CASE(FMADDSUB_RND)
30162 NODE_NAME_CASE(FMSUBADD_RND)
30163 NODE_NAME_CASE(VPMADD52H)
30164 NODE_NAME_CASE(VPMADD52L)
30165 NODE_NAME_CASE(VRNDSCALE)
30166 NODE_NAME_CASE(STRICT_VRNDSCALE)
30167 NODE_NAME_CASE(VRNDSCALE_SAE)
30168 NODE_NAME_CASE(VRNDSCALES)
30169 NODE_NAME_CASE(VRNDSCALES_SAE)
30170 NODE_NAME_CASE(VREDUCE)
30171 NODE_NAME_CASE(VREDUCE_SAE)
30172 NODE_NAME_CASE(VREDUCES)
30173 NODE_NAME_CASE(VREDUCES_SAE)
30174 NODE_NAME_CASE(VGETMANT)
30175 NODE_NAME_CASE(VGETMANT_SAE)
30176 NODE_NAME_CASE(VGETMANTS)
30177 NODE_NAME_CASE(VGETMANTS_SAE)
30178 NODE_NAME_CASE(PCMPESTR)
30179 NODE_NAME_CASE(PCMPISTR)
30180 NODE_NAME_CASE(XTEST)
30181 NODE_NAME_CASE(COMPRESS)
30182 NODE_NAME_CASE(EXPAND)
30183 NODE_NAME_CASE(SELECTS)
30184 NODE_NAME_CASE(ADDSUB)
30185 NODE_NAME_CASE(RCP14)
30186 NODE_NAME_CASE(RCP14S)
30187 NODE_NAME_CASE(RCP28)
30188 NODE_NAME_CASE(RCP28_SAE)
30189 NODE_NAME_CASE(RCP28S)
30190 NODE_NAME_CASE(RCP28S_SAE)
30191 NODE_NAME_CASE(EXP2)
30192 NODE_NAME_CASE(EXP2_SAE)
30193 NODE_NAME_CASE(RSQRT14)
30194 NODE_NAME_CASE(RSQRT14S)
30195 NODE_NAME_CASE(RSQRT28)
30196 NODE_NAME_CASE(RSQRT28_SAE)
30197 NODE_NAME_CASE(RSQRT28S)
30198 NODE_NAME_CASE(RSQRT28S_SAE)
30199 NODE_NAME_CASE(FADD_RND)
30200 NODE_NAME_CASE(FADDS)
30201 NODE_NAME_CASE(FADDS_RND)
30202 NODE_NAME_CASE(FSUB_RND)
30203 NODE_NAME_CASE(FSUBS)
30204 NODE_NAME_CASE(FSUBS_RND)
30205 NODE_NAME_CASE(FMUL_RND)
30206 NODE_NAME_CASE(FMULS)
30207 NODE_NAME_CASE(FMULS_RND)
30208 NODE_NAME_CASE(FDIV_RND)
30209 NODE_NAME_CASE(FDIVS)
30210 NODE_NAME_CASE(FDIVS_RND)
30211 NODE_NAME_CASE(FSQRT_RND)
30212 NODE_NAME_CASE(FSQRTS)
30213 NODE_NAME_CASE(FSQRTS_RND)
30214 NODE_NAME_CASE(FGETEXP)
30215 NODE_NAME_CASE(FGETEXP_SAE)
30216 NODE_NAME_CASE(FGETEXPS)
30217 NODE_NAME_CASE(FGETEXPS_SAE)
30218 NODE_NAME_CASE(SCALEF)
30219 NODE_NAME_CASE(SCALEF_RND)
30220 NODE_NAME_CASE(SCALEFS)
30221 NODE_NAME_CASE(SCALEFS_RND)
30222 NODE_NAME_CASE(AVG)
30223 NODE_NAME_CASE(MULHRS)
30224 NODE_NAME_CASE(SINT_TO_FP_RND)
30225 NODE_NAME_CASE(UINT_TO_FP_RND)
30226 NODE_NAME_CASE(CVTTP2SI)
30227 NODE_NAME_CASE(CVTTP2UI)
30228 NODE_NAME_CASE(STRICT_CVTTP2SI)
30229 NODE_NAME_CASE(STRICT_CVTTP2UI)
30230 NODE_NAME_CASE(MCVTTP2SI)
30231 NODE_NAME_CASE(MCVTTP2UI)
30232 NODE_NAME_CASE(CVTTP2SI_SAE)
30233 NODE_NAME_CASE(CVTTP2UI_SAE)
30234 NODE_NAME_CASE(CVTTS2SI)
30235 NODE_NAME_CASE(CVTTS2UI)
30236 NODE_NAME_CASE(CVTTS2SI_SAE)
30237 NODE_NAME_CASE(CVTTS2UI_SAE)
30238 NODE_NAME_CASE(CVTSI2P)
30239 NODE_NAME_CASE(CVTUI2P)
30240 NODE_NAME_CASE(STRICT_CVTSI2P)
30241 NODE_NAME_CASE(STRICT_CVTUI2P)
30242 NODE_NAME_CASE(MCVTSI2P)
30243 NODE_NAME_CASE(MCVTUI2P)
30244 NODE_NAME_CASE(VFPCLASS)
30245 NODE_NAME_CASE(VFPCLASSS)
30246 NODE_NAME_CASE(MULTISHIFT)
30247 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
30248 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
30249 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
30250 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
30251 NODE_NAME_CASE(CVTPS2PH)
30252 NODE_NAME_CASE(STRICT_CVTPS2PH)
30253 NODE_NAME_CASE(MCVTPS2PH)
30254 NODE_NAME_CASE(CVTPH2PS)
30255 NODE_NAME_CASE(STRICT_CVTPH2PS)
30256 NODE_NAME_CASE(CVTPH2PS_SAE)
30257 NODE_NAME_CASE(CVTP2SI)
30258 NODE_NAME_CASE(CVTP2UI)
30259 NODE_NAME_CASE(MCVTP2SI)
30260 NODE_NAME_CASE(MCVTP2UI)
30261 NODE_NAME_CASE(CVTP2SI_RND)
30262 NODE_NAME_CASE(CVTP2UI_RND)
30263 NODE_NAME_CASE(CVTS2SI)
30264 NODE_NAME_CASE(CVTS2UI)
30265 NODE_NAME_CASE(CVTS2SI_RND)
30266 NODE_NAME_CASE(CVTS2UI_RND)
30267 NODE_NAME_CASE(CVTNE2PS2BF16)
30268 NODE_NAME_CASE(CVTNEPS2BF16)
30269 NODE_NAME_CASE(MCVTNEPS2BF16)
30270 NODE_NAME_CASE(DPBF16PS)
30271 NODE_NAME_CASE(LWPINS)
30272 NODE_NAME_CASE(MGATHER)
30273 NODE_NAME_CASE(MSCATTER)
30274 NODE_NAME_CASE(VPDPBUSD)
30275 NODE_NAME_CASE(VPDPBUSDS)
30276 NODE_NAME_CASE(VPDPWSSD)
30277 NODE_NAME_CASE(VPDPWSSDS)
30278 NODE_NAME_CASE(VPSHUFBITQMB)
30279 NODE_NAME_CASE(GF2P8MULB)
30280 NODE_NAME_CASE(GF2P8AFFINEQB)
30281 NODE_NAME_CASE(GF2P8AFFINEINVQB)
30282 NODE_NAME_CASE(NT_CALL)
30283 NODE_NAME_CASE(NT_BRIND)
30284 NODE_NAME_CASE(UMWAIT)
30285 NODE_NAME_CASE(TPAUSE)
30286 NODE_NAME_CASE(ENQCMD)
30287 NODE_NAME_CASE(ENQCMDS)
30288 NODE_NAME_CASE(VP2INTERSECT)
30289 }
30290 return nullptr;
30291#undef NODE_NAME_CASE
30292}
30293
30294/// Return true if the addressing mode represented by AM is legal for this
30295/// target, for a load/store of the specified type.
30296bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
30297 const AddrMode &AM, Type *Ty,
30298 unsigned AS,
30299 Instruction *I) const {
30300 // X86 supports extremely general addressing modes.
30301 CodeModel::Model M = getTargetMachine().getCodeModel();
30302
30303 // X86 allows a sign-extended 32-bit immediate field as a displacement.
30304 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
30305 return false;
30306
30307 if (AM.BaseGV) {
30308 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
30309
30310 // If a reference to this global requires an extra load, we can't fold it.
30311 if (isGlobalStubReference(GVFlags))
30312 return false;
30313
30314 // If BaseGV requires a register for the PIC base, we cannot also have a
30315 // BaseReg specified.
30316 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
30317 return false;
30318
30319 // If lower 4G is not available, then we must use rip-relative addressing.
30320 if ((M != CodeModel::Small || isPositionIndependent()) &&
30321 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
30322 return false;
30323 }
30324
30325 switch (AM.Scale) {
30326 case 0:
30327 case 1:
30328 case 2:
30329 case 4:
30330 case 8:
30331 // These scales always work.
30332 break;
30333 case 3:
30334 case 5:
30335 case 9:
30336 // These scales are formed with basereg+scalereg. Only accept if there is
30337 // no basereg yet.
30338 if (AM.HasBaseReg)
30339 return false;
30340 break;
30341 default: // Other stuff never works.
30342 return false;
30343 }
30344
30345 return true;
30346}
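// A standalone sketch (not part of this file) of the scale rule above:
// scales 1, 2, 4 and 8 are encoded directly in the SIB byte, while 3, 5 and 9
// are only reachable as base + 2/4/8 * index (e.g. "lea (%rdi,%rdi,2), %rax"),
// so they consume the base-register slot. The helper name is illustrative.
static bool isLegalX86Scale(unsigned Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;              // native SIB scales always work
  case 3: case 5: case 9:
    return !HasBaseReg;       // needs reg + scaled reg, so the base must be free
  default:
    return false;             // no other scale can be encoded
  }
}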
30347
30348bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
30349 unsigned Bits = Ty->getScalarSizeInBits();
30350
30351 // 8-bit shifts are always expensive, and versions that take a scalar amount
30352 // are not meaningfully cheaper than those that do not.
30353 if (Bits == 8)
30354 return false;
30355
30356 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
30357 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
30358 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
30359 return false;
30360
30361 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
30362 // shifts just as cheap as scalar ones.
30363 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
30364 return false;
30365
30366 // AVX512BW has shifts such as vpsllvw.
30367 if (Subtarget.hasBWI() && Bits == 16)
30368 return false;
30369
30370 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
30371 // fully general vector.
30372 return true;
30373}
30374
30375bool X86TargetLowering::isBinOp(unsigned Opcode) const {
30376 switch (Opcode) {
30377 // These are non-commutative binops.
30378 // TODO: Add more X86ISD opcodes once we have test coverage.
30379 case X86ISD::ANDNP:
30380 case X86ISD::PCMPGT:
30381 case X86ISD::FMAX:
30382 case X86ISD::FMIN:
30383 case X86ISD::FANDN:
30384 return true;
30385 }
30386
30387 return TargetLoweringBase::isBinOp(Opcode);
30388}
30389
30390bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
30391 switch (Opcode) {
30392 // TODO: Add more X86ISD opcodes once we have test coverage.
30393 case X86ISD::PCMPEQ:
30394 case X86ISD::PMULDQ:
30395 case X86ISD::PMULUDQ:
30396 case X86ISD::FMAXC:
30397 case X86ISD::FMINC:
30398 case X86ISD::FAND:
30399 case X86ISD::FOR:
30400 case X86ISD::FXOR:
30401 return true;
30402 }
30403
30404 return TargetLoweringBase::isCommutativeBinOp(Opcode);
30405}
30406
30407bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
30408 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
30409 return false;
30410 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
30411 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
30412 return NumBits1 > NumBits2;
30413}
30414
30415bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
30416 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
30417 return false;
30418
30419 if (!isTypeLegal(EVT::getEVT(Ty1)))
30420 return false;
30421
30422 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
30423
30424 // Assuming the caller doesn't have a zeroext or signext return parameter,
30425 // truncation all the way down to i1 is valid.
30426 return true;
30427}
30428
30429bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
30430 return isInt<32>(Imm);
30431}
30432
30433bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
30434 // Can also use sub to handle negated immediates.
30435 return isInt<32>(Imm);
30436}
30437
30438bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
30439 return isInt<32>(Imm);
30440}
30441
30442bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
30443 if (!VT1.isInteger() || !VT2.isInteger())
30444 return false;
30445 unsigned NumBits1 = VT1.getSizeInBits();
30446 unsigned NumBits2 = VT2.getSizeInBits();
30447 return NumBits1 > NumBits2;
30448}
30449
30450bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
30451 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
30452 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
30453}
30454
30455bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
30456 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
30457 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
30458}
30459
30460bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
30461 EVT VT1 = Val.getValueType();
30462 if (isZExtFree(VT1, VT2))
30463 return true;
30464
30465 if (Val.getOpcode() != ISD::LOAD)
30466 return false;
30467
30468 if (!VT1.isSimple() || !VT1.isInteger() ||
30469 !VT2.isSimple() || !VT2.isInteger())
30470 return false;
30471
30472 switch (VT1.getSimpleVT().SimpleTy) {
30473 default: break;
30474 case MVT::i8:
30475 case MVT::i16:
30476 case MVT::i32:
30477 // X86 has 8, 16, and 32-bit zero-extending loads.
30478 return true;
30479 }
30480
30481 return false;
30482}
30483
30484bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
30485 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
30486 return false;
30487
30488 EVT SrcVT = ExtVal.getOperand(0).getValueType();
30489
30490 // There is no extending load for vXi1.
30491 if (SrcVT.getScalarType() == MVT::i1)
30492 return false;
30493
30494 return true;
30495}
30496
30497bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
30498 EVT VT) const {
30499 if (!Subtarget.hasAnyFMA())
30500 return false;
30501
30502 VT = VT.getScalarType();
30503
30504 if (!VT.isSimple())
30505 return false;
30506
30507 switch (VT.getSimpleVT().SimpleTy) {
30508 case MVT::f32:
30509 case MVT::f64:
30510 return true;
30511 default:
30512 break;
30513 }
30514
30515 return false;
30516}
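// A small sketch of what returning true here permits: the combiner may
// contract a separate multiply and add into one fused multiply-add, which
// plain C++ can request explicitly via std::fma. This is illustrative only
// and not code from this file; with FMA enabled it typically lowers to a
// vfmadd* instruction.
#include <cmath>

double fusedMultiplyAdd(double a, double b, double c) {
  return std::fma(a, b, c); // one rounding step instead of two
}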
30517
30518bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
30519 // i16 instructions are longer (0x66 prefix) and potentially slower.
30520 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
30521}
30522
30523/// Targets can use this to indicate that they only support *some*
30524/// VECTOR_SHUFFLE operations, those with specific masks.
30525/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
30526/// are assumed to be legal.
30527bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
30528 if (!VT.isSimple())
30529 return false;
30530
30531 // Not for i1 vectors
30532 if (VT.getSimpleVT().getScalarType() == MVT::i1)
30533 return false;
30534
30535 // Very little shuffling can be done for 64-bit vectors right now.
30536 if (VT.getSimpleVT().getSizeInBits() == 64)
30537 return false;
30538
30539 // We only care that the types being shuffled are legal. The lowering can
30540 // handle any possible shuffle mask that results.
30541 return isTypeLegal(VT.getSimpleVT());
30542}
30543
30544bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
30545 EVT VT) const {
30546 // Don't convert an 'and' into a shuffle that we don't directly support.
30547 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
30548 if (!Subtarget.hasAVX2())
30549 if (VT == MVT::v32i8 || VT == MVT::v16i16)
30550 return false;
30551
30552 // Just delegate to the generic legality, clear masks aren't special.
30553 return isShuffleMaskLegal(Mask, VT);
30554}
30555
30556bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
30557 // If the subtarget is using retpolines, we must not generate jump tables.
30558 if (Subtarget.useRetpolineIndirectBranches())
30559 return false;
30560
30561 // Otherwise, fall back on the generic logic.
30562 return TargetLowering::areJTsAllowed(Fn);
30563}
30564
30565//===----------------------------------------------------------------------===//
30566// X86 Scheduler Hooks
30567//===----------------------------------------------------------------------===//
30568
30569/// Utility function to emit xbegin specifying the start of an RTM region.
30570static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
30571 const TargetInstrInfo *TII) {
30572 DebugLoc DL = MI.getDebugLoc();
30573
30574 const BasicBlock *BB = MBB->getBasicBlock();
30575 MachineFunction::iterator I = ++MBB->getIterator();
30576
30577 // For the v = xbegin(), we generate
30578 //
30579 // thisMBB:
30580 // xbegin sinkMBB
30581 //
30582 // mainMBB:
30583 // s0 = -1
30584 //
30585 // fallBB:
30586 // eax = # XABORT_DEF
30587 // s1 = eax
30588 //
30589 // sinkMBB:
30590 // v = phi(s0/mainBB, s1/fallBB)
30591
30592 MachineBasicBlock *thisMBB = MBB;
30593 MachineFunction *MF = MBB->getParent();
30594 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
30595 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
30596 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
30597 MF->insert(I, mainMBB);
30598 MF->insert(I, fallMBB);
30599 MF->insert(I, sinkMBB);
30600
30601 // Transfer the remainder of BB and its successor edges to sinkMBB.
30602 sinkMBB->splice(sinkMBB->begin(), MBB,
30603 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
30604 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
30605
30606 MachineRegisterInfo &MRI = MF->getRegInfo();
30607 Register DstReg = MI.getOperand(0).getReg();
30608 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
30609 Register mainDstReg = MRI.createVirtualRegister(RC);
30610 Register fallDstReg = MRI.createVirtualRegister(RC);
30611
30612 // thisMBB:
30613 // xbegin fallMBB
30614 // # fallthrough to mainMBB
30615 // # on abort, branch to fallMBB
30616 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
30617 thisMBB->addSuccessor(mainMBB);
30618 thisMBB->addSuccessor(fallMBB);
30619
30620 // mainMBB:
30621 // mainDstReg := -1
30622 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
30623 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
30624 mainMBB->addSuccessor(sinkMBB);
30625
30626 // fallMBB:
30627 // ; pseudo instruction to model hardware's definition from XABORT
30628 // EAX := XABORT_DEF
30629 // fallDstReg := EAX
30630 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
30631 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
30632 .addReg(X86::EAX);
30633 fallMBB->addSuccessor(sinkMBB);
30634
30635 // sinkMBB:
30636 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
30637 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
30638 .addReg(mainDstReg).addMBB(mainMBB)
30639 .addReg(fallDstReg).addMBB(fallMBB);
30640
30641 MI.eraseFromParent();
30642 return sinkMBB;
30643}
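// A user-level sketch of the pattern this lowers (assuming RTM support and
// <immintrin.h>; not code from this file). The -1 materialized in mainMBB is
// exactly _XBEGIN_STARTED, and the abort path reads the status from EAX.
#include <immintrin.h>

int runTransaction() {
  unsigned Status = _xbegin();        // lowered through the XBEGIN_4 pseudo
  if (Status == _XBEGIN_STARTED) {    // mainMBB path: v == -1
    /* ...transactional work... */
    _xend();
    return 0;
  }
  return (int)Status;                 // fallMBB path: abort code from EAX
}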
30644
30645
30646
30647MachineBasicBlock *
30648X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
30649 MachineBasicBlock *MBB) const {
30650 // Emit va_arg instruction on X86-64.
30651
30652 // Operands to this pseudo-instruction:
30653 // 0 ) Output : destination address (reg)
30654 // 1-5) Input : va_list address (addr, i64mem)
30655 // 6 ) ArgSize : Size (in bytes) of vararg type
30656 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
30657 // 8 ) Align : Alignment of type
30658 // 9 ) EFLAGS (implicit-def)
30659
30660 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
30661 static_assert(X86::AddrNumOperands == 5,
30662 "VAARG_64 assumes 5 address operands");
30663
30664 Register DestReg = MI.getOperand(0).getReg();
30665 MachineOperand &Base = MI.getOperand(1);
30666 MachineOperand &Scale = MI.getOperand(2);
30667 MachineOperand &Index = MI.getOperand(3);
30668 MachineOperand &Disp = MI.getOperand(4);
30669 MachineOperand &Segment = MI.getOperand(5);
30670 unsigned ArgSize = MI.getOperand(6).getImm();
30671 unsigned ArgMode = MI.getOperand(7).getImm();
30672 unsigned Align = MI.getOperand(8).getImm();
30673
30674 MachineFunction *MF = MBB->getParent();
30675
30676 // Memory Reference
30677 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
30678
30679 MachineMemOperand *OldMMO = MI.memoperands().front();
30680
30681 // Clone the MMO into two separate MMOs for loading and storing
30682 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
30683 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
30684 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
30685 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
30686
30687 // Machine Information
30688 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30689 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
30690 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
30691 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
30692 DebugLoc DL = MI.getDebugLoc();
30693
30694 // struct va_list {
30695 // i32 gp_offset
30696 // i32 fp_offset
30697 // i64 overflow_area (address)
30698 // i64 reg_save_area (address)
30699 // }
30700 // sizeof(va_list) = 24
30701 // alignment(va_list) = 8
30702
30703 unsigned TotalNumIntRegs = 6;
30704 unsigned TotalNumXMMRegs = 8;
30705 bool UseGPOffset = (ArgMode == 1);
30706 bool UseFPOffset = (ArgMode == 2);
30707 unsigned MaxOffset = TotalNumIntRegs * 8 +
30708 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
30709
30710 /* Align ArgSize to a multiple of 8 */
30711 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
30712 bool NeedsAlign = (Align > 8);
30713
30714 MachineBasicBlock *thisMBB = MBB;
30715 MachineBasicBlock *overflowMBB;
30716 MachineBasicBlock *offsetMBB;
30717 MachineBasicBlock *endMBB;
30718
30719 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
30720 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
30721 unsigned OffsetReg = 0;
30722
30723 if (!UseGPOffset && !UseFPOffset) {
30724 // If we only pull from the overflow region, we don't create a branch.
30725 // We don't need to alter control flow.
30726 OffsetDestReg = 0; // unused
30727 OverflowDestReg = DestReg;
30728
30729 offsetMBB = nullptr;
30730 overflowMBB = thisMBB;
30731 endMBB = thisMBB;
30732 } else {
30733 // First emit code to check if gp_offset (or fp_offset) is below the bound.
30734 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
30735 // If not, pull from overflow_area. (branch to overflowMBB)
30736 //
30737 // thisMBB
30738 // | .
30739 // | .
30740 // offsetMBB overflowMBB
30741 // | .
30742 // | .
30743 // endMBB
30744
30745 // Registers for the PHI in endMBB
30746 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
30747 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
30748
30749 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
30750 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30751 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30752 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
30753
30754 MachineFunction::iterator MBBIter = ++MBB->getIterator();
30755
30756 // Insert the new basic blocks
30757 MF->insert(MBBIter, offsetMBB);
30758 MF->insert(MBBIter, overflowMBB);
30759 MF->insert(MBBIter, endMBB);
30760
30761 // Transfer the remainder of MBB and its successor edges to endMBB.
30762 endMBB->splice(endMBB->begin(), thisMBB,
30763 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
30764 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
30765
30766 // Make offsetMBB and overflowMBB successors of thisMBB
30767 thisMBB->addSuccessor(offsetMBB);
30768 thisMBB->addSuccessor(overflowMBB);
30769
30770 // endMBB is a successor of both offsetMBB and overflowMBB
30771 offsetMBB->addSuccessor(endMBB);
30772 overflowMBB->addSuccessor(endMBB);
30773
30774 // Load the offset value into a register
30775 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
30776 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
30777 .add(Base)
30778 .add(Scale)
30779 .add(Index)
30780 .addDisp(Disp, UseFPOffset ? 4 : 0)
30781 .add(Segment)
30782 .setMemRefs(LoadOnlyMMO);
30783
30784 // Check if there is enough room left to pull this argument.
30785 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
30786 .addReg(OffsetReg)
30787 .addImm(MaxOffset + 8 - ArgSizeA8);
30788
30789 // Branch to "overflowMBB" if offset >= max
30790 // Fall through to "offsetMBB" otherwise
30791 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
30792 .addMBB(overflowMBB).addImm(X86::COND_AE);
30793 }
30794
30795 // In offsetMBB, emit code to use the reg_save_area.
30796 if (offsetMBB) {
30797 assert(OffsetReg != 0);
30798
30799 // Read the reg_save_area address.
30800 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
30801 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
30802 .add(Base)
30803 .add(Scale)
30804 .add(Index)
30805 .addDisp(Disp, 16)
30806 .add(Segment)
30807 .setMemRefs(LoadOnlyMMO);
30808
30809 // Zero-extend the offset
30810 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
30811 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
30812 .addImm(0)
30813 .addReg(OffsetReg)
30814 .addImm(X86::sub_32bit);
30815
30816 // Add the offset to the reg_save_area to get the final address.
30817 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
30818 .addReg(OffsetReg64)
30819 .addReg(RegSaveReg);
30820
30821 // Compute the offset for the next argument
30822 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
30823 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
30824 .addReg(OffsetReg)
30825 .addImm(UseFPOffset ? 16 : 8);
30826
30827 // Store it back into the va_list.
30828 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
30829 .add(Base)
30830 .add(Scale)
30831 .add(Index)
30832 .addDisp(Disp, UseFPOffset ? 4 : 0)
30833 .add(Segment)
30834 .addReg(NextOffsetReg)
30835 .setMemRefs(StoreOnlyMMO);
30836
30837 // Jump to endMBB
30838 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
30839 .addMBB(endMBB);
30840 }
30841
30842 //
30843 // Emit code to use overflow area
30844 //
30845
30846 // Load the overflow_area address into a register.
30847 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
30848 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
30849 .add(Base)
30850 .add(Scale)
30851 .add(Index)
30852 .addDisp(Disp, 8)
30853 .add(Segment)
30854 .setMemRefs(LoadOnlyMMO);
30855
30856 // If we need to align it, do so. Otherwise, just copy the address
30857 // to OverflowDestReg.
30858 if (NeedsAlign) {
30859 // Align the overflow address
30860 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
30861 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
30862
30863 // aligned_addr = (addr + (align-1)) & ~(align-1)
30864 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
30865 .addReg(OverflowAddrReg)
30866 .addImm(Align-1);
30867
30868 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
30869 .addReg(TmpReg)
30870 .addImm(~(uint64_t)(Align-1));
30871 } else {
30872 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
30873 .addReg(OverflowAddrReg);
30874 }
30875
30876 // Compute the next overflow address after this argument.
30877 // (the overflow address should be kept 8-byte aligned)
30878 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
30879 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
30880 .addReg(OverflowDestReg)
30881 .addImm(ArgSizeA8);
30882
30883 // Store the new overflow address.
30884 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
30885 .add(Base)
30886 .add(Scale)
30887 .add(Index)
30888 .addDisp(Disp, 8)
30889 .add(Segment)
30890 .addReg(NextAddrReg)
30891 .setMemRefs(StoreOnlyMMO);
30892
30893 // If we branched, emit the PHI to the front of endMBB.
30894 if (offsetMBB) {
30895 BuildMI(*endMBB, endMBB->begin(), DL,
30896 TII->get(X86::PHI), DestReg)
30897 .addReg(OffsetDestReg).addMBB(offsetMBB)
30898 .addReg(OverflowDestReg).addMBB(overflowMBB);
30899 }
30900
30901 // Erase the pseudo instruction
30902 MI.eraseFromParent();
30903
30904 return endMBB;
30905}
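// The same algorithm written as plain C++ against the SysV va_list layout
// described above. This is only a sketch of the ArgMode == 1 (gp_offset)
// path, with illustrative names that are not part of this file; extra
// alignment of the overflow area beyond 8 bytes is omitted.
struct VaListLayout {
  unsigned gp_offset;       // i32 at offset 0
  unsigned fp_offset;       // i32 at offset 4
  char *overflow_arg_area;  // i64 at offset 8
  char *reg_save_area;      // i64 at offset 16
};

void *vaArgGP(VaListLayout *AP, unsigned ArgSizeA8 /* ArgSize rounded up to 8 */) {
  const unsigned MaxOffset = 6 * 8;                 // TotalNumIntRegs * 8
  if (AP->gp_offset < MaxOffset + 8 - ArgSizeA8) {  // offsetMBB path
    void *Addr = AP->reg_save_area + AP->gp_offset;
    AP->gp_offset += 8;                             // advance to the next GP slot
    return Addr;
  }
  void *Addr = AP->overflow_arg_area;               // overflowMBB path
  AP->overflow_arg_area += ArgSizeA8;               // keeps 8-byte alignment
  return Addr;
}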
30906
30907MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
30908 MachineInstr &MI, MachineBasicBlock *MBB) const {
30909 // Emit code to save XMM registers to the stack. The ABI says that the
30910 // number of registers to save is given in %al, so it's theoretically
30911 // possible to do an indirect jump trick to avoid saving all of them;
30912 // however, this code takes a simpler approach and just executes all
30913 // of the stores if %al is non-zero. It's less code, and it's probably
30914 // easier on the hardware branch predictor, and stores aren't all that
30915 // expensive anyway.
30916
30917 // Create the new basic blocks. One block contains all the XMM stores,
30918 // and one block is the final destination regardless of whether any
30919 // stores were performed.
30920 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
30921 MachineFunction *F = MBB->getParent();
30922 MachineFunction::iterator MBBIter = ++MBB->getIterator();
30923 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
30924 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
30925 F->insert(MBBIter, XMMSaveMBB);
30926 F->insert(MBBIter, EndMBB);
30927
30928 // Transfer the remainder of MBB and its successor edges to EndMBB.
30929 EndMBB->splice(EndMBB->begin(), MBB,
30930 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
30931 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
30932
30933 // The original block will now fall through to the XMM save block.
30934 MBB->addSuccessor(XMMSaveMBB);
30935 // The XMMSaveMBB will fall through to the end block.
30936 XMMSaveMBB->addSuccessor(EndMBB);
30937
30938 // Now add the instructions.
30939 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
30940 DebugLoc DL = MI.getDebugLoc();
30941
30942 Register CountReg = MI.getOperand(0).getReg();
30943 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
30944 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
30945
30946 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
30947 // If %al is 0, branch around the XMM save block.
30948 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
30949 BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(EndMBB).addImm(X86::COND_E);
30950 MBB->addSuccessor(EndMBB);
30951 }
30952
30953 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
30954 // that was just emitted, but clearly shouldn't be "saved".
30955 assert((MI.getNumOperands() <= 3 ||
30956         !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
30957         MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
30958        "Expected last argument to be EFLAGS");
30959 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
30960 // In the XMM save block, save all the XMM argument registers.
30961 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
30962 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
30963 MachineMemOperand *MMO = F->getMachineMemOperand(
30964 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
30965 MachineMemOperand::MOStore,
30966 /*Size=*/16, /*Align=*/16);
30967 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
30968 .addFrameIndex(RegSaveFrameIndex)
30969 .addImm(/*Scale=*/1)
30970 .addReg(/*IndexReg=*/0)
30971 .addImm(/*Disp=*/Offset)
30972 .addReg(/*Segment=*/0)
30973 .addReg(MI.getOperand(i).getReg())
30974 .addMemOperand(MMO);
30975 }
30976
30977 MI.eraseFromParent(); // The pseudo instruction is gone now.
30978
30979 return EndMBB;
30980}
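// In C-like terms, the two blocks built above implement the va_start guard
// from the SysV ABI: %al carries the number of vector registers the caller
// actually used, and the aligned stores are skipped when it is zero. A rough
// sketch with a hypothetical storeXMM helper standing in for the
// MOVAPS/VMOVAPS stores (illustrative names, not code from this file):
void storeXMM(int RegNo, char *Addr); // hypothetical, for illustration only

void saveXMMArgRegs(unsigned char AL, char *RegSaveFrame, long VarArgsFPOffset) {
  if (AL == 0)                        // TEST8rr %al,%al ; JCC_1 COND_E -> EndMBB
    return;                           // branch around the XMM save block
  for (int I = 0; I < 8; ++I)         // one 16-byte-aligned store per register
    storeXMM(I, RegSaveFrame + VarArgsFPOffset + I * 16);
}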
30981
30982// The EFLAGS operand of SelectItr might be missing a kill marker
30983// because there were multiple uses of EFLAGS, and ISel didn't know
30984// which to mark. Figure out whether SelectItr should have had a
30985// kill marker, and set it if it should. Returns the correct kill
30986// marker value.
30987static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
30988 MachineBasicBlock* BB,
30989 const TargetRegisterInfo* TRI) {
30990 // Scan forward through BB for a use/def of EFLAGS.
30991 MachineBasicBlock::iterator miI(std::next(SelectItr));
30992 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
30993 const MachineInstr& mi = *miI;
30994 if (mi.readsRegister(X86::EFLAGS))
30995 return false;
30996 if (mi.definesRegister(X86::EFLAGS))
30997 break; // Should have kill-flag - update below.
30998 }
30999
31000 // If we hit the end of the block, check whether EFLAGS is live into a
31001 // successor.
31002 if (miI == BB->end()) {
31003 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
31004 sEnd = BB->succ_end();
31005 sItr != sEnd; ++sItr) {
31006 MachineBasicBlock* succ = *sItr;
31007 if (succ->isLiveIn(X86::EFLAGS))
31008 return false;
31009 }
31010 }
31011
31012 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
31013 // out. SelectMI should have a kill flag on EFLAGS.
31014 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
31015 return true;
31016}
31017
31018// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
31019// together with other CMOV pseudo-opcodes into a single basic block with a
31020// conditional jump around it.
31021static bool isCMOVPseudo(MachineInstr &MI) {
31022 switch (MI.getOpcode()) {
31023 case X86::CMOV_FR32:
31024 case X86::CMOV_FR32X:
31025 case X86::CMOV_FR64:
31026 case X86::CMOV_FR64X:
31027 case X86::CMOV_GR8:
31028 case X86::CMOV_GR16:
31029 case X86::CMOV_GR32:
31030 case X86::CMOV_RFP32:
31031 case X86::CMOV_RFP64:
31032 case X86::CMOV_RFP80:
31033 case X86::CMOV_VR64:
31034 case X86::CMOV_VR128:
31035 case X86::CMOV_VR128X:
31036 case X86::CMOV_VR256:
31037 case X86::CMOV_VR256X:
31038 case X86::CMOV_VR512:
31039 case X86::CMOV_VK1:
31040 case X86::CMOV_VK2:
31041 case X86::CMOV_VK4:
31042 case X86::CMOV_VK8:
31043 case X86::CMOV_VK16:
31044 case X86::CMOV_VK32:
31045 case X86::CMOV_VK64:
31046 return true;
31047
31048 default:
31049 return false;
31050 }
31051}
31052
31053// Helper function that inserts PHI nodes into SinkMBB:
31054// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
31055// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
31056// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
31057// the last PHI inserted.
31058static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
31059 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
31060 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
31061 MachineBasicBlock *SinkMBB) {
31062 MachineFunction *MF = TrueMBB->getParent();
31063 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
31064 DebugLoc DL = MIItBegin->getDebugLoc();
31065
31066 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
31067 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
31068
31069 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
31070
31071 // As we are creating the PHIs, we have to be careful if there is more than
31072 // one. Later CMOVs may reference the results of earlier CMOVs, but later
31073 // PHIs have to reference the individual true/false inputs from earlier PHIs.
31074 // That also means that PHI construction must work forward from earlier to
31075 // later, and that the code must maintain a mapping from each earlier PHI's
31076 // destination register to the registers that went into that PHI.
31077 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
31078 MachineInstrBuilder MIB;
31079
31080 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
31081 Register DestReg = MIIt->getOperand(0).getReg();
31082 Register Op1Reg = MIIt->getOperand(1).getReg();
31083 Register Op2Reg = MIIt->getOperand(2).getReg();
31084
31085 // If this CMOV we are generating is the opposite condition from
31086 // the jump we generated, then we have to swap the operands for the
31087 // PHI that is going to be generated.
31088 if (MIIt->getOperand(3).getImm() == OppCC)
31089 std::swap(Op1Reg, Op2Reg);
31090
31091 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
31092 Op1Reg = RegRewriteTable[Op1Reg].first;
31093
31094 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
31095 Op2Reg = RegRewriteTable[Op2Reg].second;
31096
31097 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
31098 .addReg(Op1Reg)
31099 .addMBB(FalseMBB)
31100 .addReg(Op2Reg)
31101 .addMBB(TrueMBB);
31102
31103 // Add this PHI to the rewrite table.
31104 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
31105 }
31106
31107 return MIB;
31108}
31109
31110// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
31111MachineBasicBlock *
31112X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
31113 MachineInstr &SecondCascadedCMOV,
31114 MachineBasicBlock *ThisMBB) const {
31115 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31116 DebugLoc DL = FirstCMOV.getDebugLoc();
31117
31118 // We lower cascaded CMOVs such as
31119 //
31120 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
31121 //
31122 // to two successive branches.
31123 //
31124 // Without this, we would add a PHI between the two jumps, which ends up
31125 // creating a few copies all around. For instance, for
31126 //
31127 // (sitofp (zext (fcmp une)))
31128 //
31129 // we would generate:
31130 //
31131 // ucomiss %xmm1, %xmm0
31132 // movss <1.0f>, %xmm0
31133 // movaps %xmm0, %xmm1
31134 // jne .LBB5_2
31135 // xorps %xmm1, %xmm1
31136 // .LBB5_2:
31137 // jp .LBB5_4
31138 // movaps %xmm1, %xmm0
31139 // .LBB5_4:
31140 // retq
31141 //
31142 // because this custom-inserter would have generated:
31143 //
31144 // A
31145 // | \
31146 // | B
31147 // | /
31148 // C
31149 // | \
31150 // | D
31151 // | /
31152 // E
31153 //
31154 // A: X = ...; Y = ...
31155 // B: empty
31156 // C: Z = PHI [X, A], [Y, B]
31157 // D: empty
31158 // E: PHI [X, C], [Z, D]
31159 //
31160 // If we lower both CMOVs in a single step, we can instead generate:
31161 //
31162 // A
31163 // | \
31164 // | C
31165 // | /|
31166 // |/ |
31167 // | |
31168 // | D
31169 // | /
31170 // E
31171 //
31172 // A: X = ...; Y = ...
31173 // D: empty
31174 // E: PHI [X, A], [X, C], [Y, D]
31175 //
31176 // Which, in our sitofp/fcmp example, gives us something like:
31177 //
31178 // ucomiss %xmm1, %xmm0
31179 // movss <1.0f>, %xmm0
31180 // jne .LBB5_4
31181 // jp .LBB5_4
31182 // xorps %xmm0, %xmm0
31183 // .LBB5_4:
31184 // retq
31185 //
31186
31187 // We lower cascaded CMOV into two successive branches to the same block.
31188 // EFLAGS is used by both, so mark it as live in the second.
31189 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
31190 MachineFunction *F = ThisMBB->getParent();
31191 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
31192 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
31193 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
31194
31195 MachineFunction::iterator It = ++ThisMBB->getIterator();
31196 F->insert(It, FirstInsertedMBB);
31197 F->insert(It, SecondInsertedMBB);
31198 F->insert(It, SinkMBB);
31199
31200 // For a cascaded CMOV, we lower it to two successive branches to
31201 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
31202 // the FirstInsertedMBB.
31203 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
31204
31205 // If the EFLAGS register isn't dead in the terminator, then claim that it's
31206 // live into the sink and copy blocks.
31207 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
31208 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
31209 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
31210 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
31211 SinkMBB->addLiveIn(X86::EFLAGS);
31212 }
31213
31214 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
31215 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
31216 std::next(MachineBasicBlock::iterator(FirstCMOV)),
31217 ThisMBB->end());
31218 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
31219
31220 // Fallthrough block for ThisMBB.
31221 ThisMBB->addSuccessor(FirstInsertedMBB);
31222 // The true block target of the first branch is always SinkMBB.
31223 ThisMBB->addSuccessor(SinkMBB);
31224 // Fallthrough block for FirstInsertedMBB.
31225 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
31226 // The true block for the branch of FirstInsertedMBB.
31227 FirstInsertedMBB->addSuccessor(SinkMBB);
31228 // This is fallthrough.
31229 SecondInsertedMBB->addSuccessor(SinkMBB);
31230
31231 // Create the conditional branch instructions.
31232 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
31233 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
31234
31235 X86::CondCode SecondCC =
31236 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
31237 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
31238
31239 // SinkMBB:
31240 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
31241 Register DestReg = FirstCMOV.getOperand(0).getReg();
31242 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
31243 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
31244 MachineInstrBuilder MIB =
31245 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
31246 .addReg(Op1Reg)
31247 .addMBB(SecondInsertedMBB)
31248 .addReg(Op2Reg)
31249 .addMBB(ThisMBB);
31250
31251 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
31252 // (the True operand of the SELECT_CC/CMOV nodes).
31253 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
31254 // Copy the PHI result to the register defined by the second CMOV.
31255 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
31256 TII->get(TargetOpcode::COPY),
31257 SecondCascadedCMOV.getOperand(0).getReg())
31258 .addReg(FirstCMOV.getOperand(0).getReg());
31259
31260 // Now remove the CMOVs.
31261 FirstCMOV.eraseFromParent();
31262 SecondCascadedCMOV.eraseFromParent();
31263
31264 return SinkMBB;
31265}
31266
31267MachineBasicBlock *
31268X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
31269 MachineBasicBlock *ThisMBB) const {
31270 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31271 DebugLoc DL = MI.getDebugLoc();
31272
31273 // To "insert" a SELECT_CC instruction, we actually have to insert the
31274 // diamond control-flow pattern. The incoming instruction knows the
31275 // destination vreg to set, the condition code register to branch on, the
31276 // true/false values to select between and a branch opcode to use.
31277
31278 // ThisMBB:
31279 // ...
31280 // TrueVal = ...
31281 // cmpTY ccX, r1, r2
31282 // bCC copy1MBB
31283 // fallthrough --> FalseMBB
31284
31285 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
31286 // as described above, by inserting a BB, and then making a PHI at the join
31287 // point to select the true and false operands of the CMOV in the PHI.
31288 //
31289 // The code also handles two different cases of multiple CMOV opcodes
31290 // in a row.
31291 //
31292 // Case 1:
31293 // In this case, there are multiple CMOVs in a row, all of which are based on
31294 // the same condition setting (or the exact opposite condition setting).
31295 // In this case we can lower all the CMOVs using a single inserted BB, and
31296 // then make a number of PHIs at the join point to model the CMOVs. The only
31297 // trickiness here is that in a case like:
31298 //
31299 // t2 = CMOV cond1 t1, f1
31300 // t3 = CMOV cond1 t2, f2
31301 //
31302 // when rewriting this into PHIs, we have to perform some renaming on the
31303 // temps since you cannot have a PHI operand refer to a PHI result earlier
31304 // in the same block. The "simple" but wrong lowering would be:
31305 //
31306 // t2 = PHI t1(BB1), f1(BB2)
31307 // t3 = PHI t2(BB1), f2(BB2)
31308 //
31309 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
31310 // renaming is to note that on the path through BB1, t2 is really just a
31311 // copy of t1, and do that renaming, properly generating:
31312 //
31313 // t2 = PHI t1(BB1), f1(BB2)
31314 // t3 = PHI t1(BB1), f2(BB2)
31315 //
31316 // Case 2:
31317 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
31318 // function - EmitLoweredCascadedSelect.
31319
31320 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
31321 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
31322 MachineInstr *LastCMOV = &MI;
31323 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
31324
31325 // Check for case 1, where there are multiple CMOVs with the same condition
31326 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
31327 // number of jumps the most.
31328
31329 if (isCMOVPseudo(MI)) {
31330 // See if we have a string of CMOVs with the same condition. Skip over
31331 // intervening debug insts.
31332 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
31333 (NextMIIt->getOperand(3).getImm() == CC ||
31334 NextMIIt->getOperand(3).getImm() == OppCC)) {
31335 LastCMOV = &*NextMIIt;
31336 ++NextMIIt;
31337 NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
31338 }
31339 }
31340
31341 // Check for case 2, but only if we didn't already find case 1 (as
31342 // indicated by LastCMOV still pointing at MI).
31343 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
31344 NextMIIt->getOpcode() == MI.getOpcode() &&
31345 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
31346 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
31347 NextMIIt->getOperand(1).isKill()) {
31348 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
31349 }
31350
31351 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
31352 MachineFunction *F = ThisMBB->getParent();
31353 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
31354 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
31355
31356 MachineFunction::iterator It = ++ThisMBB->getIterator();
31357 F->insert(It, FalseMBB);
31358 F->insert(It, SinkMBB);
31359
31360 // If the EFLAGS register isn't dead in the terminator, then claim that it's
31361 // live into the sink and copy blocks.
31362 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
31363 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
31364 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
31365 FalseMBB->addLiveIn(X86::EFLAGS);
31366 SinkMBB->addLiveIn(X86::EFLAGS);
31367 }
31368
31369 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
31370 auto DbgEnd = MachineBasicBlock::iterator(LastCMOV);
31371 auto DbgIt = MachineBasicBlock::iterator(MI);
31372 while (DbgIt != DbgEnd) {
31373 auto Next = std::next(DbgIt);
31374 if (DbgIt->isDebugInstr())
31375 SinkMBB->push_back(DbgIt->removeFromParent());
31376 DbgIt = Next;
31377 }
31378
31379 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
31380 SinkMBB->splice(SinkMBB->end(), ThisMBB,
31381 std::next(MachineBasicBlock::iterator(LastCMOV)),
31382 ThisMBB->end());
31383 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
31384
31385 // Fallthrough block for ThisMBB.
31386 ThisMBB->addSuccessor(FalseMBB);
31387 // The true block target of the first (or only) branch is always SinkMBB.
31388 ThisMBB->addSuccessor(SinkMBB);
31389 // Fallthrough block for FalseMBB.
31390 FalseMBB->addSuccessor(SinkMBB);
31391
31392 // Create the conditional branch instruction.
31393 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
31394
31395 // SinkMBB:
31396 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
31397 // ...
31398 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
31399 MachineBasicBlock::iterator MIItEnd =
31400 std::next(MachineBasicBlock::iterator(LastCMOV));
31401 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
31402
31403 // Now remove the CMOV(s).
31404 ThisMBB->erase(MIItBegin, MIItEnd);
31405
31406 return SinkMBB;
31407}
31408
31409MachineBasicBlock *
31410X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
31411 MachineBasicBlock *BB) const {
31412 MachineFunction *MF = BB->getParent();
31413 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31414 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
31415 DebugLoc DL = MI.getDebugLoc();
31416 const BasicBlock *LLVM_BB = BB->getBasicBlock();
31417
31418 const unsigned ProbeSize = getStackProbeSize(*MF);
31419
31420 MachineRegisterInfo &MRI = MF->getRegInfo();
31421 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31422 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31423 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31424
31425 MachineFunction::iterator MBBIter = ++BB->getIterator();
31426 MF->insert(MBBIter, testMBB);
31427 MF->insert(MBBIter, blockMBB);
31428 MF->insert(MBBIter, tailMBB);
31429
31430 unsigned sizeVReg = MI.getOperand(1).getReg();
31431
31432 const TargetRegisterClass *SizeRegClass = MRI.getRegClass(sizeVReg);
31433
31434 unsigned tmpSizeVReg = MRI.createVirtualRegister(SizeRegClass);
31435 unsigned tmpSizeVReg2 = MRI.createVirtualRegister(SizeRegClass);
31436
31437 unsigned physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
31438
31439 // testMBB: check whether the remaining allocation size is at least ProbeSize.
31440 BuildMI(testMBB, DL, TII->get(X86::PHI), tmpSizeVReg)
31441 .addReg(sizeVReg)
31442 .addMBB(BB)
31443 .addReg(tmpSizeVReg2)
31444 .addMBB(blockMBB);
31445
31446 BuildMI(testMBB, DL,
31447 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64ri32 : X86::CMP32ri))
31448 .addReg(tmpSizeVReg)
31449 .addImm(ProbeSize);
31450
31451 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
31452 .addMBB(tailMBB)
31453 .addImm(X86::COND_L);
31454 testMBB->addSuccessor(blockMBB);
31455 testMBB->addSuccessor(tailMBB);
31456
31457 // allocate a block and touch it
31458
31459 BuildMI(blockMBB, DL,
31460 TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
31461 tmpSizeVReg2)
31462 .addReg(tmpSizeVReg)
31463 .addImm(ProbeSize);
31464
31465 BuildMI(blockMBB, DL,
31466 TII->get(TFI.Uses64BitFramePtr ? X86::SUB64ri32 : X86::SUB32ri),
31467 physSPReg)
31468 .addReg(physSPReg)
31469 .addImm(ProbeSize);
31470
31471 const unsigned MovMIOpc =
31472 TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
31473 addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
31474 .addImm(0);
31475
31476 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
31477 blockMBB->addSuccessor(testMBB);
31478
31479 // allocate the tail and continue
31480 BuildMI(tailMBB, DL,
31481 TII->get(TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr),
31482 physSPReg)
31483 .addReg(physSPReg)
31484 .addReg(tmpSizeVReg);
31485 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
31486 .addReg(physSPReg);
31487
31488 tailMBB->splice(tailMBB->end(), BB,
31489 std::next(MachineBasicBlock::iterator(MI)), BB->end());
31490 tailMBB->transferSuccessorsAndUpdatePHIs(BB);
31491 BB->addSuccessor(testMBB);
31492
31493 // Delete the original pseudo instruction.
31494 MI.eraseFromParent();
31495
31496 // And we're done.
31497 return tailMBB;
31498}
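// The loop built above, restated as a sketch (illustrative names, not code
// from this file): peel off at most ProbeSize bytes at a time and touch each
// newly exposed page so the stack guard page is always hit in order.
char *probedAlloca(char *SP, unsigned long long Size, unsigned long long ProbeSize) {
  unsigned long long Remaining = Size;
  while (Remaining >= ProbeSize) {  // testMBB: CMP + JCC_1 COND_L -> tailMBB
    Remaining -= ProbeSize;         // blockMBB
    SP -= ProbeSize;
    *SP = 0;                        // store 0 to touch the fresh page
  }
  SP -= Remaining;                  // tailMBB: allocate the tail
  return SP;                        // copied into the result register
}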
31499
31500MachineBasicBlock *
31501X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
31502 MachineBasicBlock *BB) const {
31503 MachineFunction *MF = BB->getParent();
31504 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31505 DebugLoc DL = MI.getDebugLoc();
31506 const BasicBlock *LLVM_BB = BB->getBasicBlock();
31507
31508 assert(MF->shouldSplitStack());
31509
31510 const bool Is64Bit = Subtarget.is64Bit();
31511 const bool IsLP64 = Subtarget.isTarget64BitLP64();
31512
31513 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
31514 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
31515
31516 // BB:
31517 // ... [everything up to the alloca]
31518 // If the stacklet is not large enough, jump to mallocMBB
31519 //
31520 // bumpMBB:
31521 // Allocate by subtracting from RSP
31522 // Jump to continueMBB
31523 //
31524 // mallocMBB:
31525 // Allocate by call to runtime
31526 //
31527 // continueMBB:
31528 // ...
31529 // [rest of original BB]
31530 //
31531
31532 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31533 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31534 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
31535
31536 MachineRegisterInfo &MRI = MF->getRegInfo();
31537 const TargetRegisterClass *AddrRegClass =
31538 getRegClassFor(getPointerTy(MF->getDataLayout()));
31539
31540 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
31541 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
31542 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
31543 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
31544 sizeVReg = MI.getOperand(1).getReg(),
31545 physSPReg =
31546 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
31547
31548 MachineFunction::iterator MBBIter = ++BB->getIterator();
31549
31550 MF->insert(MBBIter, bumpMBB);
31551 MF->insert(MBBIter, mallocMBB);
31552 MF->insert(MBBIter, continueMBB);
31553
31554 continueMBB->splice(continueMBB->begin(), BB,
31555 std::next(MachineBasicBlock::iterator(MI)), BB->end());
31556 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
31557
31558 // Add code to the main basic block to check if the stack limit has been hit,
31559 // and if so, jump to mallocMBB otherwise to bumpMBB.
31560 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
31561 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
31562 .addReg(tmpSPVReg).addReg(sizeVReg);
31563 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
31564 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
31565 .addReg(SPLimitVReg);
31566 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
31567
31568 // bumpMBB simply decreases the stack pointer, since we know the current
31569 // stacklet has enough space.
31570 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
31571 .addReg(SPLimitVReg);
31572 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
31573 .addReg(SPLimitVReg);
31574 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
31575
31576 // Calls into a routine in libgcc to allocate more space from the heap.
31577 const uint32_t *RegMask =
31578 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
31579 if (IsLP64) {
31580 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
31581 .addReg(sizeVReg);
31582 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
31583 .addExternalSymbol("__morestack_allocate_stack_space")
31584 .addRegMask(RegMask)
31585 .addReg(X86::RDI, RegState::Implicit)
31586 .addReg(X86::RAX, RegState::ImplicitDefine);
31587 } else if (Is64Bit) {
31588 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
31589 .addReg(sizeVReg);
31590 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
31591 .addExternalSymbol("__morestack_allocate_stack_space")
31592 .addRegMask(RegMask)
31593 .addReg(X86::EDI, RegState::Implicit)
31594 .addReg(X86::EAX, RegState::ImplicitDefine);
31595 } else {
31596 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
31597 .addImm(12);
31598 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
31599 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
31600 .addExternalSymbol("__morestack_allocate_stack_space")
31601 .addRegMask(RegMask)
31602 .addReg(X86::EAX, RegState::ImplicitDefine);
31603 }
31604
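// The 32-bit path above reserved 12 bytes and then pushed a 4-byte argument
// (16 bytes total, presumably to keep the stack 16-byte aligned around the
// call); reclaim all 16 bytes here.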
31605 if (!Is64Bit)
31606 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
31607 .addImm(16);
31608
31609 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
31610 .addReg(IsLP64 ? X86::RAX : X86::EAX);
31611 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
31612
31613 // Set up the CFG correctly.
31614 BB->addSuccessor(bumpMBB);
31615 BB->addSuccessor(mallocMBB);
31616 mallocMBB->addSuccessor(continueMBB);
31617 bumpMBB->addSuccessor(continueMBB);
31618
31619 // Take care of the PHI nodes.
31620 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
31621 MI.getOperand(0).getReg())
31622 .addReg(mallocPtrVReg)
31623 .addMBB(mallocMBB)
31624 .addReg(bumpSPPtrVReg)
31625 .addMBB(bumpMBB);
31626
31627 // Delete the original pseudo instruction.
31628 MI.eraseFromParent();
31629
31630 // And we're done.
31631 return continueMBB;
31632}
31633
31634MachineBasicBlock *
31635X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
31636 MachineBasicBlock *BB) const {
31637 MachineFunction *MF = BB->getParent();
31638 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
31639 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
31640 DebugLoc DL = MI.getDebugLoc();
31641
31642 assert(!isAsynchronousEHPersonality(
31643 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
31644 "SEH does not use catchret!");
31645
31646 // Only 32-bit EH needs to worry about manually restoring stack pointers.
31647 if (!Subtarget.is32Bit())
31648 return BB;
31649
31650 // C++ EH creates a new target block to hold the restore code, and wires up
31651 // the new block to the return destination with a normal JMP_4.
31652 MachineBasicBlock *RestoreMBB =
31653 MF->CreateMachineBasicBlock(BB->getBasicBlock());
31654 assert(BB->succ_size() == 1);
31655 MF->insert(std::next(BB->getIterator()), RestoreMBB);
31656 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
31657 BB->addSuccessor(RestoreMBB);
31658 MI.getOperand(0).setMBB(RestoreMBB);
31659
31660 // Marking this as an EH pad but not a funclet entry block causes PEI to
31661 // restore stack pointers in the block.
31662 RestoreMBB->setIsEHPad(true);
31663
31664 auto RestoreMBBI = RestoreMBB->begin();
31665 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
31666 return BB;
31667}
31668
31669MachineBasicBlock *
31670X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
31671 MachineBasicBlock *BB) const {
31672 // So, here we replace TLSADDR with the sequence:
31673 // adjust_stackdown -> TLSADDR -> adjust_stackup.
31674 // We need this because TLSADDR is lowered into calls
31675 // inside MC, therefore without the two markers shrink-wrapping
31676 // may push the prologue/epilogue past them.
31677 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
31678 DebugLoc DL = MI.getDebugLoc();
31679 MachineFunction &MF = *BB->getParent();
31680
31681 // Emit CALLSEQ_START right before the instruction.
31682 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
31683 MachineInstrBuilder CallseqStart =
31684 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
31685 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
31686
31687 // Emit CALLSEQ_END right after the instruction.
31688 // We don't call erase from parent because we want to keep the
31689 // original instruction around.
31690 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
31691 MachineInstrBuilder CallseqEnd =
31692 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
31693 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
31694
31695 return BB;
31696}
31697
31698MachineBasicBlock *
31699X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
31700 MachineBasicBlock *BB) const {
31701 // This is pretty easy. We're taking the value that we received from
31702 // our load from the relocation, sticking it in either RDI (x86-64)
31703 // or EAX and doing an indirect call. The return value will then
31704 // be in the normal return register.
31705 MachineFunction *F = BB->getParent();
31706 const X86InstrInfo *TII = Subtarget.getInstrInfo();
31707 DebugLoc DL = MI.getDebugLoc();
31708
31709 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
31710 assert(MI.getOperand(3).isGlobal() && "This should be a global");
31711
31712 // Get a register mask for the lowered call.
31713 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
31714 // proper register mask.
31715 const uint32_t *RegMask =
31716 Subtarget.is64Bit() ?
31717 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
31718 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
31719 if (Subtarget.is64Bit()) {
31720 MachineInstrBuilder MIB =
31721 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
31722 .addReg(X86::RIP)
31723 .addImm(0)
31724 .addReg(0)
31725 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
31726 MI.getOperand(3).getTargetFlags())
31727 .addReg(0);
31728 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
31729 addDirectMem(MIB, X86::RDI);
31730 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
31731 } else if (!isPositionIndependent()) {
31732 MachineInstrBuilder MIB =
31733 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
31734 .addReg(0)
31735 .addImm(0)
31736 .addReg(0)
31737 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
31738 MI.getOperand(3).getTargetFlags())
31739 .addReg(0);
31740 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
31741 addDirectMem(MIB, X86::EAX);
31742 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
31743 } else {
31744 MachineInstrBuilder MIB =
31745 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
31746 .addReg(TII->getGlobalBaseReg(F))
31747 .addImm(0)
31748 .addReg(0)
31749 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
31750 MI.getOperand(3).getTargetFlags())
31751 .addReg(0);
31752 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
31753 addDirectMem(MIB, X86::EAX);
31754 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
31755 }
31756
31757 MI.eraseFromParent(); // The pseudo instruction is gone now.
31758 return BB;
31759}
31760
31761static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
31762 switch (RPOpc) {
31763 case X86::RETPOLINE_CALL32:
31764 return X86::CALLpcrel32;
31765 case X86::RETPOLINE_CALL64:
31766 return X86::CALL64pcrel32;
31767 case X86::RETPOLINE_TCRETURN32:
31768 return X86::TCRETURNdi;
31769 case X86::RETPOLINE_TCRETURN64:
31770 return X86::TCRETURNdi64;
31771 }
31772 llvm_unreachable("not retpoline opcode");
31773}
31774
31775static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
31776 unsigned Reg) {
31777 if (Subtarget.useRetpolineExternalThunk()) {
31778 // When using an external thunk for retpolines, we pick names that match the
31779 // names GCC happens to use as well. This helps simplify the implementation
31780 // of the thunks for kernels where they have no easy ability to create
31781 // aliases and are doing non-trivial configuration of the thunk's body. For
31782 // example, the Linux kernel will do boot-time hot patching of the thunk
31783 // bodies and cannot easily export aliases of these to loaded modules.
31784 //
31785 // Note that at any point in the future, we may need to change the semantics
31786 // of how we implement retpolines and at that time will likely change the
31787 // name of the called thunk. Essentially, there is no hard guarantee that
31788 // LLVM will generate calls to specific thunks, we merely make a best-effort
31789 // attempt to help out kernels and other systems where duplicating the
31790 // thunks is costly.
31791 switch (Reg) {
31792 case X86::EAX:
31793 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31794 return "__x86_indirect_thunk_eax";
31795 case X86::ECX:
31796 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31797 return "__x86_indirect_thunk_ecx";
31798 case X86::EDX:
31799 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31800 return "__x86_indirect_thunk_edx";
31801 case X86::EDI:
31802 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31803 return "__x86_indirect_thunk_edi";
31804 case X86::R11:
31805 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
31806 return "__x86_indirect_thunk_r11";
31807 }
31808 llvm_unreachable("unexpected reg for retpoline");
31809 }
31810
31811 // When targeting an internal COMDAT thunk use an LLVM-specific name.
31812 switch (Reg) {
31813 case X86::EAX:
31814 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31815 return "__llvm_retpoline_eax";
31816 case X86::ECX:
31817 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31818 return "__llvm_retpoline_ecx";
31819 case X86::EDX:
31820 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31821 return "__llvm_retpoline_edx";
31822 case X86::EDI:
31823 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
31824 return "__llvm_retpoline_edi";
31825 case X86::R11:
31826 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
31827 return "__llvm_retpoline_r11";
31828 }
31829 llvm_unreachable("unexpected reg for retpoline");
31830}
31831
31832MachineBasicBlock *
31833X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
31834 MachineBasicBlock *BB) const {
31835 // Copy the virtual register into the R11 physical register and
31836 // call the retpoline thunk.
31837 DebugLoc DL = MI.getDebugLoc();
31838 const X86InstrInfo *TII = Subtarget.getInstrInfo();
31839 Register CalleeVReg = MI.getOperand(0).getReg();
31840 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
31841
31842 // Find an available scratch register to hold the callee. On 64-bit, we can
31843 // just use R11, but we scan for uses anyway to ensure we don't generate
31844 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
31845 // already a register use operand to the call to hold the callee. If none
31846 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
31847 // register and ESI is the base pointer to realigned stack frames with VLAs.
31848 SmallVector<unsigned, 3> AvailableRegs;
31849 if (Subtarget.is64Bit())
31850 AvailableRegs.push_back(X86::R11);
31851 else
31852 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
31853
31854 // Zero out any registers that are already used.
31855 for (const auto &MO : MI.operands()) {
31856 if (MO.isReg() && MO.isUse())
31857 for (unsigned &Reg : AvailableRegs)
31858 if (Reg == MO.getReg())
31859 Reg = 0;
31860 }
31861
31862 // Choose the first remaining non-zero available register.
31863 unsigned AvailableReg = 0;
31864 for (unsigned MaybeReg : AvailableRegs) {
31865 if (MaybeReg) {
31866 AvailableReg = MaybeReg;
31867 break;
31868 }
31869 }
31870 if (!AvailableReg)
31871 report_fatal_error("calling convention incompatible with retpoline, no "
31872 "available registers");
31873
31874 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
31875
31876 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
31877 .addReg(CalleeVReg);
31878 MI.getOperand(0).ChangeToES(Symbol);
31879 MI.setDesc(TII->get(Opc));
31880 MachineInstrBuilder(*BB->getParent(), &MI)
31881 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
31882 return BB;
31883}
31884
31885/// SetJmp implies future control flow change upon calling the corresponding
31886/// LongJmp.
31887/// Instead of using the 'return' instruction, the long jump fixes the stack and
31888/// performs an indirect branch. To do so it uses the registers that were stored
31889/// in the jump buffer (when calling SetJmp).
31890/// In case the shadow stack is enabled we need to fix it as well, because some
31891/// return addresses will be skipped.
31892/// The function will save the SSP for future fixing in the function
31893/// emitLongJmpShadowStackFix.
31894/// \sa emitLongJmpShadowStackFix
31895/// \param [in] MI The temporary Machine Instruction for the builtin.
31896/// \param [in] MBB The Machine Basic Block that will be modified.
31897void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
31898 MachineBasicBlock *MBB) const {
31899 DebugLoc DL = MI.getDebugLoc();
31900 MachineFunction *MF = MBB->getParent();
31901 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31902 MachineRegisterInfo &MRI = MF->getRegInfo();
31903 MachineInstrBuilder MIB;
31904
31905 // Memory Reference.
31906 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
31907 MI.memoperands_end());
31908
31909 // Initialize a register with zero.
31910 MVT PVT = getPointerTy(MF->getDataLayout());
31911 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
31912 Register ZReg = MRI.createVirtualRegister(PtrRC);
31913 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
31914 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
31915 .addDef(ZReg)
31916 .addReg(ZReg, RegState::Undef)
31917 .addReg(ZReg, RegState::Undef);
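// RDSSP is expected to leave its destination unchanged when CET shadow
// stacks are inactive, so reading into a zeroed register means a saved
// value of zero later signals "no shadow stack" to the longjmp fix-up.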
31918
31919 // Read the current SSP Register value to the zeroed register.
31920 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
31921 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
31922 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
31923
31924 // Write the SSP register value to the third pointer-sized slot of the input memory buffer.
31925 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
31926 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
31927 const int64_t SSPOffset = 3 * PVT.getStoreSize();
31928 const unsigned MemOpndSlot = 1;
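// Operand 0 of the setjmp pseudo is its result register, so the five
// address operands describing the jump buffer start at slot 1 (mirroring
// MemOpndSlot in emitEHSjLjSetJmp below).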
31929 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
31930 if (i == X86::AddrDisp)
31931 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
31932 else
31933 MIB.add(MI.getOperand(MemOpndSlot + i));
31934 }
31935 MIB.addReg(SSPCopyReg);
31936 MIB.setMemRefs(MMOs);
31937}
31938
31939MachineBasicBlock *
31940X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
31941 MachineBasicBlock *MBB) const {
31942 DebugLoc DL = MI.getDebugLoc();
31943 MachineFunction *MF = MBB->getParent();
31944 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
31945 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
31946 MachineRegisterInfo &MRI = MF->getRegInfo();
31947
31948 const BasicBlock *BB = MBB->getBasicBlock();
31949 MachineFunction::iterator I = ++MBB->getIterator();
31950
31951 // Memory Reference
31952 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
31953 MI.memoperands_end());
31954
31955 unsigned DstReg;
31956 unsigned MemOpndSlot = 0;
31957
31958 unsigned CurOp = 0;
31959
31960 DstReg = MI.getOperand(CurOp++).getReg();
31961 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
31962 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
31963 (void)TRI;
31964 Register mainDstReg = MRI.createVirtualRegister(RC);
31965 Register restoreDstReg = MRI.createVirtualRegister(RC);
31966
31967 MemOpndSlot = CurOp;
31968
31969 MVT PVT = getPointerTy(MF->getDataLayout());
31970 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
31971 "Invalid Pointer Size!");
31972
31973 // For v = setjmp(buf), we generate
31974 //
31975 // thisMBB:
31976 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
31977 // SjLjSetup restoreMBB
31978 //
31979 // mainMBB:
31980 // v_main = 0
31981 //
31982 // sinkMBB:
31983 // v = phi(main, restore)
31984 //
31985 // restoreMBB:
31986 // if base pointer being used, load it from frame
31987 // v_restore = 1
31988
31989 MachineBasicBlock *thisMBB = MBB;
31990 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
31991 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
31992 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
31993 MF->insert(I, mainMBB);
31994 MF->insert(I, sinkMBB);
31995 MF->push_back(restoreMBB);
31996 restoreMBB->setHasAddressTaken();
31997
31998 MachineInstrBuilder MIB;
31999
32000 // Transfer the remainder of BB and its successor edges to sinkMBB.
32001 sinkMBB->splice(sinkMBB->begin(), MBB,
32002 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
32003 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32004
32005 // thisMBB:
32006 unsigned PtrStoreOpc = 0;
32007 unsigned LabelReg = 0;
32008 const int64_t LabelOffset = 1 * PVT.getStoreSize();
32009 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
32010 !isPositionIndependent();
32011
32012 // Prepare IP either in reg or imm.
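// With the small code model and no PIC, the restore block's address fits in
// a 32-bit immediate and can be stored directly; otherwise it is
// materialized into LabelReg with an LEA first.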
32013 if (!UseImmLabel) {
32014 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
32015 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
32016 LabelReg = MRI.createVirtualRegister(PtrRC);
32017 if (Subtarget.is64Bit()) {
32018 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
32019 .addReg(X86::RIP)
32020 .addImm(0)
32021 .addReg(0)
32022 .addMBB(restoreMBB)
32023 .addReg(0);
32024 } else {
32025 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
32026 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
32027 .addReg(XII->getGlobalBaseReg(MF))
32028 .addImm(0)
32029 .addReg(0)
32030 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
32031 .addReg(0);
32032 }
32033 } else
32034 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
32035 // Store IP
32036 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
32037 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32038 if (i == X86::AddrDisp)
32039 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
32040 else
32041 MIB.add(MI.getOperand(MemOpndSlot + i));
32042 }
32043 if (!UseImmLabel)
32044 MIB.addReg(LabelReg);
32045 else
32046 MIB.addMBB(restoreMBB);
32047 MIB.setMemRefs(MMOs);
32048
32049 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
32050 emitSetJmpShadowStackFix(MI, thisMBB);
32051 }
32052
32053 // Setup
32054 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
32055 .addMBB(restoreMBB);
32056
32057 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
32058 MIB.addRegMask(RegInfo->getNoPreservedMask());
32059 thisMBB->addSuccessor(mainMBB);
32060 thisMBB->addSuccessor(restoreMBB);
32061
32062 // mainMBB:
32063 // EAX = 0
32064 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
32065 mainMBB->addSuccessor(sinkMBB);
32066
32067 // sinkMBB:
32068 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
32069 TII->get(X86::PHI), DstReg)
32070 .addReg(mainDstReg).addMBB(mainMBB)
32071 .addReg(restoreDstReg).addMBB(restoreMBB);
32072
32073 // restoreMBB:
32074 if (RegInfo->hasBasePointer(*MF)) {
32075 const bool Uses64BitFramePtr =
32076 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
32077 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
32078 X86FI->setRestoreBasePointer(MF);
32079 Register FramePtr = RegInfo->getFrameRegister(*MF);
32080 Register BasePtr = RegInfo->getBaseRegister();
32081 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
32082 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
32083 FramePtr, true, X86FI->getRestoreBasePointerOffset())
32084 .setMIFlag(MachineInstr::FrameSetup);
32085 }
32086 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
32087 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
32088 restoreMBB->addSuccessor(sinkMBB);
32089
32090 MI.eraseFromParent();
32091 return sinkMBB;
32092}
32093
32094/// Fix the shadow stack using the previously saved SSP pointer.
32095/// \sa emitSetJmpShadowStackFix
32096/// \param [in] MI The temporary Machine Instruction for the builtin.
32097/// \param [in] MBB The Machine Basic Block that will be modified.
32098/// \return The sink MBB that will perform the future indirect branch.
32099MachineBasicBlock *
32100X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
32101 MachineBasicBlock *MBB) const {
32102 DebugLoc DL = MI.getDebugLoc();
32103 MachineFunction *MF = MBB->getParent();
32104 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32105 MachineRegisterInfo &MRI = MF->getRegInfo();
32106
32107 // Memory Reference
32108 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
32109 MI.memoperands_end());
32110
32111 MVT PVT = getPointerTy(MF->getDataLayout());
32112 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
32113
32114 // checkSspMBB:
32115 // xor vreg1, vreg1
32116 // rdssp vreg1
32117 // test vreg1, vreg1
32118 // je sinkMBB # Jump if Shadow Stack is not supported
32119 // fallMBB:
32120 // mov buf+24/12(%rip), vreg2
32121 // sub vreg1, vreg2
32122 // jbe sinkMBB # No need to fix the Shadow Stack
32123 // fixShadowMBB:
32124 // shr 3/2, vreg2
32125 // incssp vreg2 # fix the SSP according to the lower 8 bits
32126 // shr 8, vreg2
32127 // je sinkMBB
32128 // fixShadowLoopPrepareMBB:
32129 // shl vreg2
32130 // mov 128, vreg3
32131 // fixShadowLoopMBB:
32132 // incssp vreg3
32133 // dec vreg2
32134 // jne fixShadowLoopMBB # Iterate until you finish fixing
32135 // # the Shadow Stack
32136 // sinkMBB:
32137
32138 MachineFunction::iterator I = ++MBB->getIterator();
32139 const BasicBlock *BB = MBB->getBasicBlock();
32140
32141 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
32142 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
32143 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
32144 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
32145 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
32146 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
32147 MF->insert(I, checkSspMBB);
32148 MF->insert(I, fallMBB);
32149 MF->insert(I, fixShadowMBB);
32150 MF->insert(I, fixShadowLoopPrepareMBB);
32151 MF->insert(I, fixShadowLoopMBB);
32152 MF->insert(I, sinkMBB);
32153
32154 // Transfer the remainder of BB and its successor edges to sinkMBB.
32155 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
32156 MBB->end());
32157 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
32158
32159 MBB->addSuccessor(checkSspMBB);
32160
32161 // Initialize a register with zero.
32162 Register ZReg = MRI.createVirtualRegister(PtrRC);
32163 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
32164 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
32165 .addDef(ZReg)
32166 .addReg(ZReg, RegState::Undef)
32167 .addReg(ZReg, RegState::Undef);
32168
32169 // Read the current SSP Register value to the zeroed register.
32170 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
32171 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
32172 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
32173
32174 // Check whether the value read from the SSP register is zero; if so, jump
32175 // directly to the sink.
32176 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
32177 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
32178 .addReg(SSPCopyReg)
32179 .addReg(SSPCopyReg);
32180 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
32181 checkSspMBB->addSuccessor(sinkMBB);
32182 checkSspMBB->addSuccessor(fallMBB);
32183
32184 // Reload the previously saved SSP register value.
32185 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
32186 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
32187 const int64_t SPPOffset = 3 * PVT.getStoreSize();
32188 MachineInstrBuilder MIB =
32189 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
32190 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32191 const MachineOperand &MO = MI.getOperand(i);
32192 if (i == X86::AddrDisp)
32193 MIB.addDisp(MO, SPPOffset);
32194 else if (MO.isReg()) // Don't add the whole operand, we don't want to
32195 // preserve kill flags.
32196 MIB.addReg(MO.getReg());
32197 else
32198 MIB.add(MO);
32199 }
32200 MIB.setMemRefs(MMOs);
32201
32202 // Subtract the current SSP from the previous SSP.
32203 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
32204 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
32205 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
32206 .addReg(PrevSSPReg)
32207 .addReg(SSPCopyReg);
32208
32209 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
32210 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
32211 fallMBB->addSuccessor(sinkMBB);
32212 fallMBB->addSuccessor(fixShadowMBB);
32213
32214 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
32215 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
32216 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
32217 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
32218 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
32219 .addReg(SspSubReg)
32220 .addImm(Offset);
32221
32222 // Increase the SSP using only the lower 8 bits of the delta.
32223 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
32224 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
32225
32226 // Reset the lower 8 bits.
32227 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
32228 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
32229 .addReg(SspFirstShrReg)
32230 .addImm(8);
32231
32232 // Jump if the result of the shift is zero.
32233 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
32234 fixShadowMBB->addSuccessor(sinkMBB);
32235 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
32236
32237 // Do a single shift left.
32238 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
32239 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
32240 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
32241 .addReg(SspSecondShrReg);
32242
32243 // Save the value 128 to a register (will be used next with incssp).
32244 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
32245 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
32246 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
32247 .addImm(128);
32248 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
32249
32250 // Since incssp only looks at the lower 8 bits, we might need to do several
32251 // iterations of incssp until we finish fixing the shadow stack.
32252 Register DecReg = MRI.createVirtualRegister(PtrRC);
32253 Register CounterReg = MRI.createVirtualRegister(PtrRC);
32254 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
32255 .addReg(SspAfterShlReg)
32256 .addMBB(fixShadowLoopPrepareMBB)
32257 .addReg(DecReg)
32258 .addMBB(fixShadowLoopMBB);
32259
32260 // Every iteration we increase the SSP by 128.
32261 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
32262
32263 // Every iteration we decrement the counter by 1.
32264 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
32265 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
32266
32267 // Jump if the counter is not zero yet.
32268 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
32269 fixShadowLoopMBB->addSuccessor(sinkMBB);
32270 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
32271
32272 return sinkMBB;
32273}
32274
32275MachineBasicBlock *
32276X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
32277 MachineBasicBlock *MBB) const {
32278 DebugLoc DL = MI.getDebugLoc();
32279 MachineFunction *MF = MBB->getParent();
32280 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32281 MachineRegisterInfo &MRI = MF->getRegInfo();
32282
32283 // Memory Reference
32284 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
32285 MI.memoperands_end());
32286
32287 MVT PVT = getPointerTy(MF->getDataLayout());
32288 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
32289 "Invalid Pointer Size!");
32290
32291 const TargetRegisterClass *RC =
32292 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
32293 Register Tmp = MRI.createVirtualRegister(RC);
32294 // Since FP is only updated here but NOT referenced, it's treated as GPR.
32295 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
32296 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
32297 Register SP = RegInfo->getStackRegister();
32298
32299 MachineInstrBuilder MIB;
32300
32301 const int64_t LabelOffset = 1 * PVT.getStoreSize();
32302 const int64_t SPOffset = 2 * PVT.getStoreSize();
32303
32304 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
32305 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
32306
32307 MachineBasicBlock *thisMBB = MBB;
32308
32309 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
32310 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
32311 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
32312 }
32313
32314 // Reload FP
32315 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
32316 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32317 const MachineOperand &MO = MI.getOperand(i);
32318 if (MO.isReg()) // Don't add the whole operand, we don't want to
32319 // preserve kill flags.
32320 MIB.addReg(MO.getReg());
32321 else
32322 MIB.add(MO);
32323 }
32324 MIB.setMemRefs(MMOs);
32325
32326 // Reload IP
32327 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
32328 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32329 const MachineOperand &MO = MI.getOperand(i);
32330 if (i == X86::AddrDisp)
32331 MIB.addDisp(MO, LabelOffset);
32332 else if (MO.isReg()) // Don't add the whole operand, we don't want to
32333 // preserve kill flags.
32334 MIB.addReg(MO.getReg());
32335 else
32336 MIB.add(MO);
32337 }
32338 MIB.setMemRefs(MMOs);
32339
32340 // Reload SP
32341 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
32342 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
32343 if (i == X86::AddrDisp)
32344 MIB.addDisp(MI.getOperand(i), SPOffset);
32345 else
32346 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
32347 // the last instruction of the expansion.
32348 }
32349 MIB.setMemRefs(MMOs);
32350
32351 // Jump
32352 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
32353
32354 MI.eraseFromParent();
32355 return thisMBB;
32356}
32357
32358void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
32359 MachineBasicBlock *MBB,
32360 MachineBasicBlock *DispatchBB,
32361 int FI) const {
32362 DebugLoc DL = MI.getDebugLoc();
32363 MachineFunction *MF = MBB->getParent();
32364 MachineRegisterInfo *MRI = &MF->getRegInfo();
32365 const X86InstrInfo *TII = Subtarget.getInstrInfo();
32366
32367 MVT PVT = getPointerTy(MF->getDataLayout());
32368 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
32369
32370 unsigned Op = 0;
32371 unsigned VR = 0;
32372
32373 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
32374 !isPositionIndependent();
32375
32376 if (UseImmLabel) {
32377 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
32378 } else {
32379 const TargetRegisterClass *TRC =
32380 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
32381 VR = MRI->createVirtualRegister(TRC);
32382 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
32383
32384 if (Subtarget.is64Bit())
32385 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
32386 .addReg(X86::RIP)
32387 .addImm(1)
32388 .addReg(0)
32389 .addMBB(DispatchBB)
32390 .addReg(0);
32391 else
32392 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
32393 .addReg(0) /* TII->getGlobalBaseReg(MF) */
32394 .addImm(1)
32395 .addReg(0)
32396 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
32397 .addReg(0);
32398 }
32399
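// Store the dispatch block's address (as an immediate or from VR) into the
// SjLj function context at a target-specific byte offset (56 on 64-bit,
// 36 on 32-bit); the offsets presumably match the context layout expected
// by the SjLj runtime.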
32400 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
32401 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
32402 if (UseImmLabel)
32403 MIB.addMBB(DispatchBB);
32404 else
32405 MIB.addReg(VR);
32406}
32407
32408MachineBasicBlock *
32409X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
32410 MachineBasicBlock *BB) const {
32411 DebugLoc DL = MI.getDebugLoc();
32412 MachineFunction *MF = BB->getParent();
32413 MachineRegisterInfo *MRI = &MF->getRegInfo();
32414 const X86InstrInfo *TII = Subtarget.getInstrInfo();
32415 int FI = MF->getFrameInfo().getFunctionContextIndex();
32416
32417 // Get a mapping of the call site numbers to all of the landing pads they're
32418 // associated with.
32419 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
32420 unsigned MaxCSNum = 0;
32421 for (auto &MBB : *MF) {
32422 if (!MBB.isEHPad())
32423 continue;
32424
32425 MCSymbol *Sym = nullptr;
32426 for (const auto &MI : MBB) {
32427 if (MI.isDebugInstr())
32428 continue;
32429
32430 assert(MI.isEHLabel() && "expected EH_LABEL");
32431 Sym = MI.getOperand(0).getMCSymbol();
32432 break;
32433 }
32434
32435 if (!MF->hasCallSiteLandingPad(Sym))
32436 continue;
32437
32438 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
32439 CallSiteNumToLPad[CSI].push_back(&MBB);
32440 MaxCSNum = std::max(MaxCSNum, CSI);
32441 }
32442 }
32443
32444 // Get an ordered list of the machine basic blocks for the jump table.
32445 std::vector<MachineBasicBlock *> LPadList;
32446 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
32447 LPadList.reserve(CallSiteNumToLPad.size());
32448
32449 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
32450 for (auto &LP : CallSiteNumToLPad[CSI]) {
32451 LPadList.push_back(LP);
32452 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
32453 }
32454 }
32455
32456 assert(!LPadList.empty() &&
32457 "No landing pad destinations for the dispatch jump table!");
32458
32459 // Create the MBBs for the dispatch code.
32460
32461 // Shove the dispatch's address into the return slot in the function context.
32462 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
32463 DispatchBB->setIsEHPad(true);
32464
32465 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
32466 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
32467 DispatchBB->addSuccessor(TrapBB);
32468
32469 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
32470 DispatchBB->addSuccessor(DispContBB);
32471
32472 // Insert MBBs.
32473 MF->push_back(DispatchBB);
32474 MF->push_back(DispContBB);
32475 MF->push_back(TrapBB);
32476
32477 // Insert code into the entry block that creates and registers the function
32478 // context.
32479 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
32480
32481 // Create the jump table and associated information
32482 unsigned JTE = getJumpTableEncoding();
32483 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
32484 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
32485
32486 const X86RegisterInfo &RI = TII->getRegisterInfo();
32487 // Add a register mask with no preserved registers. This results in all
32488 // registers being marked as clobbered.
32489 if (RI.hasBasePointer(*MF)) {
32490 const bool FPIs64Bit =
32491 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
32492 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
32493 MFI->setRestoreBasePointer(MF);
32494
32495 Register FP = RI.getFrameRegister(*MF);
32496 Register BP = RI.getBaseRegister();
32497 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
32498 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
32499 MFI->getRestoreBasePointerOffset())
32500 .addRegMask(RI.getNoPreservedMask());
32501 } else {
32502 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
32503 .addRegMask(RI.getNoPreservedMask());
32504 }
32505
32506 // IReg is used as an index in a memory operand and therefore can't be SP
32507 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
32508 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
32509 Subtarget.is64Bit() ? 8 : 4);
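// IReg now presumably holds the call-site index recorded in the function
// context; trap if it is out of range for the landing-pad jump table,
// otherwise fall through to the indirect dispatch.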
32510 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
32511 .addReg(IReg)
32512 .addImm(LPadList.size());
32513 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
32514
32515 if (Subtarget.is64Bit()) {
32516 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
32517 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
32518
32519 // leaq .LJTI0_0(%rip), BReg
32520 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
32521 .addReg(X86::RIP)
32522 .addImm(1)
32523 .addReg(0)
32524 .addJumpTableIndex(MJTI)
32525 .addReg(0);
32526 // movzx IReg64, IReg
32527 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
32528 .addImm(0)
32529 .addReg(IReg)
32530 .addImm(X86::sub_32bit);
32531
32532 switch (JTE) {
32533 case MachineJumpTableInfo::EK_BlockAddress:
32534 // jmpq *(BReg,IReg64,8)
32535 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
32536 .addReg(BReg)
32537 .addImm(8)
32538 .addReg(IReg64)
32539 .addImm(0)
32540 .addReg(0);
32541 break;
32542 case MachineJumpTableInfo::EK_LabelDifference32: {
32543 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
32544 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
32545 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
32546
32547 // movl (BReg,IReg64,4), OReg
32548 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
32549 .addReg(BReg)
32550 .addImm(4)
32551 .addReg(IReg64)
32552 .addImm(0)
32553 .addReg(0);
32554 // movsx OReg64, OReg
32555 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
32556 // addq BReg, OReg64, TReg
32557 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
32558 .addReg(OReg64)
32559 .addReg(BReg);
32560 // jmpq *TReg
32561 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
32562 break;
32563 }
32564 default:
32565 llvm_unreachable("Unexpected jump table encoding");
32566 }
32567 } else {
32568 // jmpl *.LJTI0_0(,IReg,4)
32569 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
32570 .addReg(0)
32571 .addImm(4)
32572 .addReg(IReg)
32573 .addJumpTableIndex(MJTI)
32574 .addReg(0);
32575 }
32576
32577 // Add the jump table entries as successors to the MBB.
32578 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
32579 for (auto &LP : LPadList)
32580 if (SeenMBBs.insert(LP).second)
32581 DispContBB->addSuccessor(LP);
32582
32583 // N.B. the order the invoke BBs are processed in doesn't matter here.
32584 SmallVector<MachineBasicBlock *, 64> MBBLPads;
32585 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
32586 for (MachineBasicBlock *MBB : InvokeBBs) {
32587 // Remove the landing pad successor from the invoke block and replace it
32588 // with the new dispatch block.
32589 // Keep a copy of Successors since it's modified inside the loop.
32590 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
32591 MBB->succ_rend());
32592 // FIXME: Avoid quadratic complexity.
32593 for (auto MBBS : Successors) {
32594 if (MBBS->isEHPad()) {
32595 MBB->removeSuccessor(MBBS);
32596 MBBLPads.push_back(MBBS);
32597 }
32598 }
32599
32600 MBB->addSuccessor(DispatchBB);
32601
32602 // Find the invoke call and mark all of the callee-saved registers as
32603 // 'implicit defined' so that they're spilled. This prevents code from
32604 // moving instructions to before the EH block, where they will never be
32605 // executed.
32606 for (auto &II : reverse(*MBB)) {
32607 if (!II.isCall())
32608 continue;
32609
32610 DenseMap<unsigned, bool> DefRegs;
32611 for (auto &MOp : II.operands())
32612 if (MOp.isReg())
32613 DefRegs[MOp.getReg()] = true;
32614
32615 MachineInstrBuilder MIB(*MF, &II);
32616 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
32617 unsigned Reg = SavedRegs[RegIdx];
32618 if (!DefRegs[Reg])
32619 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
32620 }
32621
32622 break;
32623 }
32624 }
32625
32626 // Mark all former landing pads as non-landing pads. The dispatch is the only
32627 // landing pad now.
32628 for (auto &LP : MBBLPads)
32629 LP->setIsEHPad(false);
32630
32631 // The instruction is gone now.
32632 MI.eraseFromParent();
32633 return BB;
32634}
32635
32636MachineBasicBlock *
32637X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
32638 MachineBasicBlock *BB) const {
32639 MachineFunction *MF = BB->getParent();
32640 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
32641 DebugLoc DL = MI.getDebugLoc();
32642
32643 switch (MI.getOpcode()) {
32644 default: llvm_unreachable("Unexpected instr type to insert");
32645 case X86::TLS_addr32:
32646 case X86::TLS_addr64:
32647 case X86::TLS_base_addr32:
32648 case X86::TLS_base_addr64:
32649 return EmitLoweredTLSAddr(MI, BB);
32650 case X86::RETPOLINE_CALL32:
32651 case X86::RETPOLINE_CALL64:
32652 case X86::RETPOLINE_TCRETURN32:
32653 case X86::RETPOLINE_TCRETURN64:
32654 return EmitLoweredRetpoline(MI, BB);
32655 case X86::CATCHRET:
32656 return EmitLoweredCatchRet(MI, BB);
32657 case X86::SEG_ALLOCA_32:
32658 case X86::SEG_ALLOCA_64:
32659 return EmitLoweredSegAlloca(MI, BB);
32660 case X86::PROBED_ALLOCA_32:
32661 case X86::PROBED_ALLOCA_64:
32662 return EmitLoweredProbedAlloca(MI, BB);
32663 case X86::TLSCall_32:
32664 case X86::TLSCall_64:
32665 return EmitLoweredTLSCall(MI, BB);
32666 case X86::CMOV_FR32:
32667 case X86::CMOV_FR32X:
32668 case X86::CMOV_FR64:
32669 case X86::CMOV_FR64X:
32670 case X86::CMOV_GR8:
32671 case X86::CMOV_GR16:
32672 case X86::CMOV_GR32:
32673 case X86::CMOV_RFP32:
32674 case X86::CMOV_RFP64:
32675 case X86::CMOV_RFP80:
32676 case X86::CMOV_VR64:
32677 case X86::CMOV_VR128:
32678 case X86::CMOV_VR128X:
32679 case X86::CMOV_VR256:
32680 case X86::CMOV_VR256X:
32681 case X86::CMOV_VR512:
32682 case X86::CMOV_VK1:
32683 case X86::CMOV_VK2:
32684 case X86::CMOV_VK4:
32685 case X86::CMOV_VK8:
32686 case X86::CMOV_VK16:
32687 case X86::CMOV_VK32:
32688 case X86::CMOV_VK64:
32689 return EmitLoweredSelect(MI, BB);
32690
32691 case X86::RDFLAGS32:
32692 case X86::RDFLAGS64: {
32693 unsigned PushF =
32694 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
32695 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
32696 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
32697 // Permit reads of the EFLAGS and DF registers without them being defined.
32698 // This intrinsic exists to read external processor state in flags, such as
32699 // the trap flag, interrupt flag, and direction flag, none of which are
32700 // modeled by the backend.
32701 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
32702 "Unexpected register in operand!");
32703 Push->getOperand(2).setIsUndef();
32704 assert(Push->getOperand(3).getReg() == X86::DF &&
32705 "Unexpected register in operand!");
32706 Push->getOperand(3).setIsUndef();
32707 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
32708
32709 MI.eraseFromParent(); // The pseudo is gone now.
32710 return BB;
32711 }
32712
32713 case X86::WRFLAGS32:
32714 case X86::WRFLAGS64: {
32715 unsigned Push =
32716 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
32717 unsigned PopF =
32718 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
32719 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
32720 BuildMI(*BB, MI, DL, TII->get(PopF));
32721
32722 MI.eraseFromParent(); // The pseudo is gone now.
32723 return BB;
32724 }
32725
32726 case X86::FP32_TO_INT16_IN_MEM:
32727 case X86::FP32_TO_INT32_IN_MEM:
32728 case X86::FP32_TO_INT64_IN_MEM:
32729 case X86::FP64_TO_INT16_IN_MEM:
32730 case X86::FP64_TO_INT32_IN_MEM:
32731 case X86::FP64_TO_INT64_IN_MEM:
32732 case X86::FP80_TO_INT16_IN_MEM:
32733 case X86::FP80_TO_INT32_IN_MEM:
32734 case X86::FP80_TO_INT64_IN_MEM: {
32735 // Change the floating point control register to use "round towards zero"
32736 // mode when truncating to an integer value.
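// In outline, the code below saves the current control word with FNSTCW,
// loads it, ORs in 0xC00 to force the rounding-control field to truncation,
// reloads the modified word with FLDCW, performs the IST_Fp* store, and
// finally restores the original control word with FLDCW.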
32737 int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
32738 addFrameReference(BuildMI(*BB, MI, DL,
32739 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
32740
32741 // Load the old value of the control word...
32742 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
32743 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
32744 OrigCWFrameIdx);
32745
32746 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
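// For example, assuming the common default control word 0x037F, the OR below
// produces 0x0F7F: RC (bits 11:10) becomes 0b11 (truncate) and every other
// field is preserved.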
32747 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
32748 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
32749 .addReg(OldCW, RegState::Kill).addImm(0xC00);
32750
32751 // Extract to 16 bits.
32752 Register NewCW16 =
32753 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
32754 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
32755 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
32756
32757 // Prepare memory for FLDCW.
32758 int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
32759 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
32760 NewCWFrameIdx)
32761 .addReg(NewCW16, RegState::Kill);
32762
32763 // Reload the modified control word now...
32764 addFrameReference(BuildMI(*BB, MI, DL,
32765 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
32766
32767 // Get the X86 opcode to use.
32768 unsigned Opc;
32769 switch (MI.getOpcode()) {
32770 default: llvm_unreachable("illegal opcode!");
32771 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
32772 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
32773 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
32774 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
32775 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
32776 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
32777 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
32778 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
32779 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
32780 }
32781
32782 X86AddressMode AM = getAddressFromInstr(&MI, 0);
32783 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
32784 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
32785
32786 // Reload the original control word now.
32787 addFrameReference(BuildMI(*BB, MI, DL,
32788 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
32789
32790 MI.eraseFromParent(); // The pseudo instruction is gone now.
32791 return BB;
32792 }
32793
32794 // xbegin
32795 case X86::XBEGIN:
32796 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
32797
32798 case X86::VASTART_SAVE_XMM_REGS:
32799 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
32800
32801 case X86::VAARG_64:
32802 return EmitVAARG64WithCustomInserter(MI, BB);
32803
32804 case X86::EH_SjLj_SetJmp32:
32805 case X86::EH_SjLj_SetJmp64:
32806 return emitEHSjLjSetJmp(MI, BB);
32807
32808 case X86::EH_SjLj_LongJmp32:
32809 case X86::EH_SjLj_LongJmp64:
32810 return emitEHSjLjLongJmp(MI, BB);
32811
32812 case X86::Int_eh_sjlj_setup_dispatch:
32813 return EmitSjLjDispatchBlock(MI, BB);
32814
32815 case TargetOpcode::STATEPOINT:
32816 // As an implementation detail, STATEPOINT shares the STACKMAP format at
32817 // this point in the process. We diverge later.
32818 return emitPatchPoint(MI, BB);
32819
32820 case TargetOpcode::STACKMAP:
32821 case TargetOpcode::PATCHPOINT:
32822 return emitPatchPoint(MI, BB);
32823
32824 case TargetOpcode::PATCHABLE_EVENT_CALL:
32825 return emitXRayCustomEvent(MI, BB);
32826
32827 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
32828 return emitXRayTypedEvent(MI, BB);
32829
32830 case X86::LCMPXCHG8B: {
32831 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
32832 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
32833 // requires a memory operand. If the current architecture is i686 and the
32834 // current function needs a base pointer
32835 // - which is ESI on i686 - the register allocator would not be able to
32836 // allocate registers for an address of the form X(%reg, %reg, Y):
32837 // there would never be enough unreserved registers during regalloc
32838 // (without the base pointer the only option would be X(%edi, %esi, Y)).
32839 // We give the register allocator a hand by precomputing the address in
32840 // a new vreg using LEA.
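// Illustrative (hypothetical operand): an address such as 8(%esi,%ecx,4) is
// precomputed into a vreg via LEA, and CMPXCHG8B is rewritten to use the
// plain (%vreg) form, which needs only one register besides E[ABCD].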
32841
32842 // If it is not i686 or there is no base pointer - nothing to do here.
32843 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
32844 return BB;
32845
32846 // Even though this code does not strictly need the base pointer to
32847 // be ESI, we check for that. The reason: if this assert fails, something
32848 // has changed in the compiler's base pointer handling, and those changes
32849 // most probably have to be addressed here as well.
32850 assert(TRI->getBaseRegister() == X86::ESI &&
32851 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
32852 "base pointer in mind");
32853
32854 MachineRegisterInfo &MRI = MF->getRegInfo();
32855 MVT SPTy = getPointerTy(MF->getDataLayout());
32856 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
32857 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
32858
32859 X86AddressMode AM = getAddressFromInstr(&MI, 0);
32860 // Regalloc does not need any help when the memory operand of CMPXCHG8B
32861 // does not use index register.
32862 if (AM.IndexReg == X86::NoRegister)
32863 return BB;
32864
32865 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
32866 // four operand definitions that are E[ABCD] registers. We skip them and
32867 // then insert the LEA.
32868 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
32869 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
32870 RMBBI->definesRegister(X86::EBX) ||
32871 RMBBI->definesRegister(X86::ECX) ||
32872 RMBBI->definesRegister(X86::EDX))) {
32873 ++RMBBI;
32874 }
32875 MachineBasicBlock::iterator MBBI(RMBBI);
32876 addFullAddress(
32877 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
32878
32879 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
32880
32881 return BB;
32882 }
32883 case X86::LCMPXCHG16B:
32884 return BB;
32885 case X86::LCMPXCHG8B_SAVE_EBX:
32886 case X86::LCMPXCHG16B_SAVE_RBX: {
32887 unsigned BasePtr =
32888 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
32889 if (!BB->isLiveIn(BasePtr))
32890 BB->addLiveIn(BasePtr);
32891 return BB;
32892 }
32893 }
32894}
32895
32896//===----------------------------------------------------------------------===//
32897// X86 Optimization Hooks
32898//===----------------------------------------------------------------------===//
32899
32900bool
32901X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
32902 const APInt &Demanded,
32903 TargetLoweringOpt &TLO) const {
32904 // Only optimize Ands to prevent shrinking a constant that could be
32905 // matched by movzx.
32906 if (Op.getOpcode() != ISD::AND)
32907 return false;
32908
32909 EVT VT = Op.getValueType();
32910
32911 // Ignore vectors.
32912 if (VT.isVector())
32913 return false;
32914
32915 unsigned Size = VT.getSizeInBits();
32916
32917 // Make sure the RHS really is a constant.
32918 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
32919 if (!C)
32920 return false;
32921
32922 const APInt &Mask = C->getAPIntValue();
32923
32924 // Clear all non-demanded bits initially.
32925 APInt ShrunkMask = Mask & Demanded;
32926
32927 // Find the width of the shrunk mask.
32928 unsigned Width = ShrunkMask.getActiveBits();
32929
32930 // If the mask is all 0s there's nothing to do here.
32931 if (Width == 0)
32932 return false;
32933
32934 // Find the next power of 2 width, rounding up to a byte.
32935 Width = PowerOf2Ceil(std::max(Width, 8U));
32936 // Truncate the width to size to handle illegal types.
32937 Width = std::min(Width, Size);
32938
32939 // Calculate a possible zero extend mask for this constant.
32940 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
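// Illustrative i32 example: Mask = 0x1FF with Demanded = 0xFF gives
// ShrunkMask = 0xFF and Width = 8, so ZeroExtendMask = 0xFF; the checks below
// then replace the AND constant with 0xFF, which movzx can match.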
32941
32942 // If we aren't changing the mask, just return true to keep it and prevent
32943 // the caller from optimizing.
32944 if (ZeroExtendMask == Mask)
32945 return true;
32946
32947 // Make sure the new mask can be represented by a combination of mask bits
32948 // and non-demanded bits.
32949 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
32950 return false;
32951
32952 // Replace the constant with the zero extend mask.
32953 SDLoc DL(Op);
32954 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
32955 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
32956 return TLO.CombineTo(Op, NewOp);
32957}
32958
32959void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
32960 KnownBits &Known,
32961 const APInt &DemandedElts,
32962 const SelectionDAG &DAG,
32963 unsigned Depth) const {
32964 unsigned BitWidth = Known.getBitWidth();
32965 unsigned Opc = Op.getOpcode();
32966 EVT VT = Op.getValueType();
32967 assert((Opc >= ISD::BUILTIN_OP_END ||
32968 Opc == ISD::INTRINSIC_WO_CHAIN ||
32969 Opc == ISD::INTRINSIC_W_CHAIN ||
32970 Opc == ISD::INTRINSIC_VOID) &&
32971 "Should use MaskedValueIsZero if you don't know whether Op"
32972 " is a target node!");
32973
32974 Known.resetAll();
32975 switch (Opc) {
32976 default: break;
32977 case X86ISD::SETCC:
32978 Known.Zero.setBitsFrom(1);
32979 break;
32980 case X86ISD::MOVMSK: {
32981 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
32982 Known.Zero.setBitsFrom(NumLoBits);
32983 break;
32984 }
32985 case X86ISD::PEXTRB:
32986 case X86ISD::PEXTRW: {
32987 SDValue Src = Op.getOperand(0);
32988 EVT SrcVT = Src.getValueType();
32989 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
32990 Op.getConstantOperandVal(1));
32991 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
32992 Known = Known.anyextOrTrunc(BitWidth);
32993 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
32994 break;
32995 }
32996 case X86ISD::VSRAI:
32997 case X86ISD::VSHLI:
32998 case X86ISD::VSRLI: {
32999 unsigned ShAmt = Op.getConstantOperandVal(1);
33000 if (ShAmt >= VT.getScalarSizeInBits()) {
33001 Known.setAllZero();
33002 break;
33003 }
33004
33005 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
33006 if (Opc == X86ISD::VSHLI) {
33007 Known.Zero <<= ShAmt;
33008 Known.One <<= ShAmt;
33009 // Low bits are known zero.
33010 Known.Zero.setLowBits(ShAmt);
33011 } else if (Opc == X86ISD::VSRLI) {
33012 Known.Zero.lshrInPlace(ShAmt);
33013 Known.One.lshrInPlace(ShAmt);
33014 // High bits are known zero.
33015 Known.Zero.setHighBits(ShAmt);
33016 } else {
33017 Known.Zero.ashrInPlace(ShAmt);
33018 Known.One.ashrInPlace(ShAmt);
33019 }
33020 break;
33021 }
33022 case X86ISD::PACKUS: {
33023 // PACKUS is just a truncation if the upper half is zero.
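// (The sources are twice as wide as the result; unless their upper bits are
// known zero the unsigned saturation could change the value, in which case we
// give up below.)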
33024 APInt DemandedLHS, DemandedRHS;
33025 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
33026
33027 Known.One = APInt::getAllOnesValue(BitWidth * 2);
33028 Known.Zero = APInt::getAllOnesValue(BitWidth * 2);
33029
33030 KnownBits Known2;
33031 if (!!DemandedLHS) {
33032 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
33033 Known.One &= Known2.One;
33034 Known.Zero &= Known2.Zero;
33035 }
33036 if (!!DemandedRHS) {
33037 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
33038 Known.One &= Known2.One;
33039 Known.Zero &= Known2.Zero;
33040 }
33041
33042 if (Known.countMinLeadingZeros() < BitWidth)
33043 Known.resetAll();
33044 Known = Known.trunc(BitWidth);
33045 break;
33046 }
33047 case X86ISD::ANDNP: {
33048 KnownBits Known2;
33049 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
33050 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
33051
33052 // ANDNP = (~X & Y);
33053 Known.One &= Known2.Zero;
33054 Known.Zero |= Known2.One;
33055 break;
33056 }
33057 case X86ISD::FOR: {
33058 KnownBits Known2;
33059 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
33060 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
33061
33062 // Output known-0 bits are only known if clear in both the LHS & RHS.
33063 Known.Zero &= Known2.Zero;
33064 // Output known-1 bits are known to be set if set in either the LHS | RHS.
33065 Known.One |= Known2.One;
33066 break;
33067 }
33068 case X86ISD::PSADBW: {
33069 assert(VT.getScalarType() == MVT::i64 &&
33070 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
33071 "Unexpected PSADBW types");
33072
33073 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
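// (Each i64 lane holds a sum of eight absolute byte differences, at most
// 8 * 255 = 2040, which always fits in 16 bits.)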
33074 Known.Zero.setBitsFrom(16);
33075 break;
33076 }
33077 case X86ISD::CMOV: {
33078 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
33079 // If we don't know any bits, early out.
33080 if (Known.isUnknown())
33081 break;
33082 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
33083
33084 // Only known if known in both the LHS and RHS.
33085 Known.One &= Known2.One;
33086 Known.Zero &= Known2.Zero;
33087 break;
33088 }
33089 case X86ISD::BEXTR: {
33090 SDValue Op0 = Op.getOperand(0);
33091 SDValue Op1 = Op.getOperand(1);
33092
33093 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
33094 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
33095 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
33096
33097 // If the length is 0, the result is 0.
33098 if (Length == 0) {
33099 Known.setAllZero();
33100 break;
33101 }
33102
33103 if ((Shift + Length) <= BitWidth) {
33104 Known = DAG.computeKnownBits(Op0, Depth + 1);
33105 Known = Known.extractBits(Length, Shift);
33106 Known = Known.zextOrTrunc(BitWidth);
33107 }
33108 }
33109 break;
33110 }
33111 }
33112
33113 // Handle target shuffles.
33114 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
33115 if (isTargetShuffle(Opc)) {
33116 bool IsUnary;
33117 SmallVector<int, 64> Mask;
33118 SmallVector<SDValue, 2> Ops;
33119 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
33120 IsUnary)) {
33121 unsigned NumOps = Ops.size();
33122 unsigned NumElts = VT.getVectorNumElements();
33123 if (Mask.size() == NumElts) {
33124 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
33125 Known.Zero.setAllBits(); Known.One.setAllBits();
33126 for (unsigned i = 0; i != NumElts; ++i) {
33127 if (!DemandedElts[i])
33128 continue;
33129 int M = Mask[i];
33130 if (M == SM_SentinelUndef) {
33131 // For UNDEF elements, we don't know anything about the common state
33132 // of the shuffle result.
33133 Known.resetAll();
33134 break;
33135 } else if (M == SM_SentinelZero) {
33136 Known.One.clearAllBits();
33137 continue;
33138 }
33139 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
33140 "Shuffle index out of range");
33141
33142 unsigned OpIdx = (unsigned)M / NumElts;
33143 unsigned EltIdx = (unsigned)M % NumElts;
33144 if (Ops[OpIdx].getValueType() != VT) {
33145 // TODO - handle target shuffle ops with different value types.
33146 Known.resetAll();
33147 break;
33148 }
33149 DemandedOps[OpIdx].setBit(EltIdx);
33150 }
33151 // Known bits are the values that are shared by every demanded element.
33152 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
33153 if (!DemandedOps[i])
33154 continue;
33155 KnownBits Known2 =
33156 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
33157 Known.One &= Known2.One;
33158 Known.Zero &= Known2.Zero;
33159 }
33160 }
33161 }
33162 }
33163}
33164
33165unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
33166 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
33167 unsigned Depth) const {
33168 EVT VT = Op.getValueType();
33169 unsigned VTBits = VT.getScalarSizeInBits();
33170 unsigned Opcode = Op.getOpcode();
33171 switch (Opcode) {
33172 case X86ISD::SETCC_CARRY:
33173 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
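// Every bit of the result is therefore a copy of the sign bit, so all VTBits
// bits count as sign bits.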
33174 return VTBits;
33175
33176 case X86ISD::VTRUNC: {
33177 // TODO: Add DemandedElts support.
33178 SDValue Src = Op.getOperand(0);
33179 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
33180 assert(VTBits < NumSrcBits && "Illegal truncation input type");
33181 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
33182 if (Tmp > (NumSrcBits - VTBits))
33183 return Tmp - (NumSrcBits - VTBits);
33184 return 1;
33185 }
33186
33187 case X86ISD::PACKSS: {
33188 // PACKSS is just a truncation if the sign bits extend to the packed size.
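// (If enough sign bits survive in each wide source element, the signed
// saturation never fires and PACKSS keeps the low bits unchanged, carrying the
// surplus sign bits into the result.)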
33189 APInt DemandedLHS, DemandedRHS;
33190 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
33191 DemandedRHS);
33192
33193 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
33194 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
33195 if (!!DemandedLHS)
33196 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
33197 if (!!DemandedRHS)
33198 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
33199 unsigned Tmp = std::min(Tmp0, Tmp1);
33200 if (Tmp > (SrcBits - VTBits))
33201 return Tmp - (SrcBits - VTBits);
33202 return 1;
33203 }
33204
33205 case X86ISD::VSHLI: {
33206 SDValue Src = Op.getOperand(0);
33207 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
33208 if (ShiftVal.uge(VTBits))
33209 return VTBits; // Shifted all bits out --> zero.
33210 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
33211 if (ShiftVal.uge(Tmp))
33212 return 1; // Shifted all sign bits out --> unknown.
33213 return Tmp - ShiftVal.getZExtValue();
33214 }
33215
33216 case X86ISD::VSRAI: {
33217 SDValue Src = Op.getOperand(0);
33218 APInt ShiftVal = Op.getConstantOperandAPInt(1);
33219 if (ShiftVal.uge(VTBits - 1))
33220 return VTBits; // Sign splat.
33221 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
33222 ShiftVal += Tmp;
33223 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
33224 }
33225
33226 case X86ISD::PCMPGT:
33227 case X86ISD::PCMPEQ:
33228 case X86ISD::CMPP:
33229 case X86ISD::VPCOM:
33230 case X86ISD::VPCOMU:
33231 // Vector compares return zero/all-bits result values.
33232 return VTBits;
33233
33234 case X86ISD::ANDNP: {
33235 unsigned Tmp0 =
33236 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
33237 if (Tmp0 == 1) return 1; // Early out.
33238 unsigned Tmp1 =
33239 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
33240 return std::min(Tmp0, Tmp1);
33241 }
33242
33243 case X86ISD::CMOV: {
33244 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
33245 if (Tmp0 == 1) return 1; // Early out.
33246 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
33247 return std::min(Tmp0, Tmp1);
33248 }
33249 }
33250
33251 // Handle target shuffles.
33252 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
33253 if (isTargetShuffle(Opcode)) {
33254 bool IsUnary;
33255 SmallVector<int, 64> Mask;
33256 SmallVector<SDValue, 2> Ops;
33257 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
33258 IsUnary)) {
33259 unsigned NumOps = Ops.size();
33260 unsigned NumElts = VT.getVectorNumElements();
33261 if (Mask.size() == NumElts) {
33262 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
33263 for (unsigned i = 0; i != NumElts; ++i) {
33264 if (!DemandedElts[i])
33265 continue;
33266 int M = Mask[i];
33267 if (M == SM_SentinelUndef) {
33268 // For UNDEF elements, we don't know anything about the common state
33269 // of the shuffle result.
33270 return 1;
33271 } else if (M == SM_SentinelZero) {
33272 // Zero = all sign bits.
33273 continue;
33274 }
33275 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
33276 "Shuffle index out of range");
33277
33278 unsigned OpIdx = (unsigned)M / NumElts;
33279 unsigned EltIdx = (unsigned)M % NumElts;
33280 if (Ops[OpIdx].getValueType() != VT) {
33281 // TODO - handle target shuffle ops with different value types.
33282 return 1;
33283 }
33284 DemandedOps[OpIdx].setBit(EltIdx);
33285 }
33286 unsigned Tmp0 = VTBits;
33287 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
33288 if (!DemandedOps[i])
33289 continue;
33290 unsigned Tmp1 =
33291 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
33292 Tmp0 = std::min(Tmp0, Tmp1);
33293 }
33294 return Tmp0;
33295 }
33296 }
33297 }
33298
33299 // Fallback case.
33300 return 1;
33301}
33302
33303SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
33304 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
33305 return N->getOperand(0);
33306 return N;
33307}
33308
33309// Attempt to match a combined shuffle mask against supported unary shuffle
33310// instructions.
33311// TODO: Investigate sharing more of this with shuffle lowering.
33312static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
33313 bool AllowFloatDomain, bool AllowIntDomain,
33314 SDValue &V1, const SDLoc &DL, SelectionDAG &DAG,
33315 const X86Subtarget &Subtarget, unsigned &Shuffle,
33316 MVT &SrcVT, MVT &DstVT) {
33317 unsigned NumMaskElts = Mask.size();
33318 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
33319
33320 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
33321 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
33322 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
33323 Shuffle = X86ISD::VZEXT_MOVL;
33324 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
33325 return true;
33326 }
33327
33328 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
33329 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
33330 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
33331 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
33332 unsigned MaxScale = 64 / MaskEltSize;
33333 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
33334 bool MatchAny = true;
33335 bool MatchZero = true;
33336 unsigned NumDstElts = NumMaskElts / Scale;
33337 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
33338 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
33339 MatchAny = MatchZero = false;
33340 break;
33341 }
33342 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
33343 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
33344 }
33345 if (MatchAny || MatchZero) {
33346 assert(MatchZero && "Failed to match zext but matched aext?");
33347 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
33348 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
33349 MVT::getIntegerVT(MaskEltSize);
33350 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
33351
33352 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
33353 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
33354
33355 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
33356 if (SrcVT.getVectorNumElements() != NumDstElts)
33357 Shuffle = getOpcode_EXTEND_VECTOR_INREG(Shuffle);
33358
33359 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
33360 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
33361 return true;
33362 }
33363 }
33364 }
33365
33366 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
33367 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
33368 isUndefOrEqual(Mask[0], 0) &&
33369 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
33370 Shuffle = X86ISD::VZEXT_MOVL;
33371 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
33372 return true;
33373 }
33374
33375 // Check if we have SSE3, which lets us use MOVDDUP etc. These
33376 // instructions are no slower than UNPCKLPD but have the option to
33377 // fold their input operand, even from an unaligned memory load.
33378 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
33379 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
33380 Shuffle = X86ISD::MOVDDUP;
33381 SrcVT = DstVT = MVT::v2f64;
33382 return true;
33383 }
33384 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
33385 Shuffle = X86ISD::MOVSLDUP;
33386 SrcVT = DstVT = MVT::v4f32;
33387 return true;
33388 }
33389 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
33390 Shuffle = X86ISD::MOVSHDUP;
33391 SrcVT = DstVT = MVT::v4f32;
33392 return true;
33393 }
33394 }
33395
33396 if (MaskVT.is256BitVector() && AllowFloatDomain) {
33397 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
33398 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
33399 Shuffle = X86ISD::MOVDDUP;
33400 SrcVT = DstVT = MVT::v4f64;
33401 return true;
33402 }
33403 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
33404 Shuffle = X86ISD::MOVSLDUP;
33405 SrcVT = DstVT = MVT::v8f32;
33406 return true;
33407 }
33408 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
33409 Shuffle = X86ISD::MOVSHDUP;
33410 SrcVT = DstVT = MVT::v8f32;
33411 return true;
33412 }
33413 }
33414
33415 if (MaskVT.is512BitVector() && AllowFloatDomain) {
33416 assert(Subtarget.hasAVX512() &&
33417 "AVX512 required for 512-bit vector shuffles");
33418 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
33419 Shuffle = X86ISD::MOVDDUP;
33420 SrcVT = DstVT = MVT::v8f64;
33421 return true;
33422 }
33423 if (isTargetShuffleEquivalent(
33424 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
33425 Shuffle = X86ISD::MOVSLDUP;
33426 SrcVT = DstVT = MVT::v16f32;
33427 return true;
33428 }
33429 if (isTargetShuffleEquivalent(
33430 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
33431 Shuffle = X86ISD::MOVSHDUP;
33432 SrcVT = DstVT = MVT::v16f32;
33433 return true;
33434 }
33435 }
33436
33437 return false;
33438}
33439
33440// Attempt to match a combined shuffle mask against supported unary immediate
33441// permute instructions.
33442// TODO: Investigate sharing more of this with shuffle lowering.
33443static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
33444 const APInt &Zeroable,
33445 bool AllowFloatDomain, bool AllowIntDomain,
33446 const X86Subtarget &Subtarget,
33447 unsigned &Shuffle, MVT &ShuffleVT,
33448 unsigned &PermuteImm) {
33449 unsigned NumMaskElts = Mask.size();
33450 unsigned InputSizeInBits = MaskVT.getSizeInBits();
33451 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
33452 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
33453
33454 bool ContainsZeros =
33455 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
33456
33457 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
33458 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
33459 // Check for lane crossing permutes.
33460 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
33461 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
33462 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
33463 Shuffle = X86ISD::VPERMI;
33464 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
33465 PermuteImm = getV4X86ShuffleImm(Mask);
33466 return true;
33467 }
33468 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
33469 SmallVector<int, 4> RepeatedMask;
33470 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
33471 Shuffle = X86ISD::VPERMI;
33472 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
33473 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
33474 return true;
33475 }
33476 }
33477 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
33478 // VPERMILPD can permute with a non-repeating shuffle.
33479 Shuffle = X86ISD::VPERMILPI;
33480 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
33481 PermuteImm = 0;
33482 for (int i = 0, e = Mask.size(); i != e; ++i) {
33483 int M = Mask[i];
33484 if (M == SM_SentinelUndef)
33485 continue;
33486 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
33487 PermuteImm |= (M & 1) << i;
33488 }
33489 return true;
33490 }
33491 }
33492
33493 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
33494 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
33495 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
33496 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
33497 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
33498 SmallVector<int, 4> RepeatedMask;
33499 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
33500 // Narrow the repeated mask to create 32-bit element permutes.
33501 SmallVector<int, 4> WordMask = RepeatedMask;
33502 if (MaskScalarSizeInBits == 64)
33503 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
33504
33505 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
33506 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
33507 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
33508 PermuteImm = getV4X86ShuffleImm(WordMask);
33509 return true;
33510 }
33511 }
33512
33513 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
33514 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
33515 SmallVector<int, 4> RepeatedMask;
33516 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
33517 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
33518 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
33519
33520 // PSHUFLW: permute lower 4 elements only.
33521 if (isUndefOrInRange(LoMask, 0, 4) &&
33522 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
33523 Shuffle = X86ISD::PSHUFLW;
33524 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
33525 PermuteImm = getV4X86ShuffleImm(LoMask);
33526 return true;
33527 }
33528
33529 // PSHUFHW: permute upper 4 elements only.
33530 if (isUndefOrInRange(HiMask, 4, 8) &&
33531 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
33532 // Offset the HiMask so that we can create the shuffle immediate.
33533 int OffsetHiMask[4];
33534 for (int i = 0; i != 4; ++i)
33535 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
33536
33537 Shuffle = X86ISD::PSHUFHW;
33538 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
33539 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
33540 return true;
33541 }
33542 }
33543 }
33544
33545 // Attempt to match against byte/bit shifts.
33546 if (AllowIntDomain &&
33547 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
33548 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
33549 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
33550 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
33551 Mask, 0, Zeroable, Subtarget);
33552 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
33553 32 <= ShuffleVT.getScalarSizeInBits())) {
33554 PermuteImm = (unsigned)ShiftAmt;
33555 return true;
33556 }
33557 }
33558
33559 // Attempt to match against bit rotates.
33560 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
33561 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
33562 Subtarget.hasAVX512())) {
33563 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
33564 Subtarget, Mask);
33565 if (0 < RotateAmt) {
33566 Shuffle = X86ISD::VROTLI;
33567 PermuteImm = (unsigned)RotateAmt;
33568 return true;
33569 }
33570 }
33571
33572 return false;
33573}
33574
33575 // Attempt to match a combined shuffle mask against supported binary
33576 // shuffle instructions.
33577// TODO: Investigate sharing more of this with shuffle lowering.
33578static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
33579 bool AllowFloatDomain, bool AllowIntDomain,
33580 SDValue &V1, SDValue &V2, const SDLoc &DL,
33581 SelectionDAG &DAG, const X86Subtarget &Subtarget,
33582 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
33583 bool IsUnary) {
33584 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
33585
33586 if (MaskVT.is128BitVector()) {
33587 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
33588 V2 = V1;
33589 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
33590 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
33591 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
33592 return true;
33593 }
33594 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
33595 V2 = V1;
33596 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
33597 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
33598 return true;
33599 }
33600 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
33601 (AllowFloatDomain || !Subtarget.hasSSE41())) {
33602 std::swap(V1, V2);
33603 Shuffle = X86ISD::MOVSD;
33604 SrcVT = DstVT = MVT::v2f64;
33605 return true;
33606 }
33607 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
33608 (AllowFloatDomain || !Subtarget.hasSSE41())) {
33609 Shuffle = X86ISD::MOVSS;
33610 SrcVT = DstVT = MVT::v4f32;
33611 return true;
33612 }
33613 }
33614
33615 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
33616 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
33617 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
33618 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
33619 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
33620 Subtarget)) {
33621 DstVT = MaskVT;
33622 return true;
33623 }
33624 }
33625
33626 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
33627 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
33628 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
33629 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
33630 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
33631 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
33632 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
33633 Subtarget)) {
33634 SrcVT = DstVT = MaskVT;
33635 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
33636 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
33637 return true;
33638 }
33639 }
33640
33641 return false;
33642}
33643
33644static bool matchBinaryPermuteShuffle(
33645 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
33646 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
33647 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
33648 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
33649 unsigned NumMaskElts = Mask.size();
33650 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
33651
33652 // Attempt to match against PALIGNR byte rotate.
33653 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
33654 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
33655 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
33656 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
33657 if (0 < ByteRotation) {
33658 Shuffle = X86ISD::PALIGNR;
33659 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
33660 PermuteImm = ByteRotation;
33661 return true;
33662 }
33663 }
33664
33665 // Attempt to combine to X86ISD::BLENDI.
33666 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
33667 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
33668 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
33669 uint64_t BlendMask = 0;
33670 bool ForceV1Zero = false, ForceV2Zero = false;
33671 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
33672 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
33673 ForceV2Zero, BlendMask)) {
33674 if (MaskVT == MVT::v16i16) {
33675 // We can only use v16i16 PBLENDW if the lanes are repeated.
33676 SmallVector<int, 8> RepeatedMask;
33677 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
33678 RepeatedMask)) {
33679 assert(RepeatedMask.size() == 8 &&
33680 "Repeated mask size doesn't match!");
33681 PermuteImm = 0;
33682 for (int i = 0; i < 8; ++i)
33683 if (RepeatedMask[i] >= 8)
33684 PermuteImm |= 1 << i;
33685 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
33686 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
33687 Shuffle = X86ISD::BLENDI;
33688 ShuffleVT = MaskVT;
33689 return true;
33690 }
33691 } else {
33692 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
33693 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
33694 PermuteImm = (unsigned)BlendMask;
33695 Shuffle = X86ISD::BLENDI;
33696 ShuffleVT = MaskVT;
33697 return true;
33698 }
33699 }
33700 }
33701
33702 // Attempt to combine to INSERTPS, but only if it has elements that need to
33703 // be set to zero.
33704 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
33705 MaskVT.is128BitVector() &&
33706 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
33707 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
33708 Shuffle = X86ISD::INSERTPS;
33709 ShuffleVT = MVT::v4f32;
33710 return true;
33711 }
33712
33713 // Attempt to combine to SHUFPD.
33714 if (AllowFloatDomain && EltSizeInBits == 64 &&
33715 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
33716 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
33717 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
33718 bool ForceV1Zero = false, ForceV2Zero = false;
33719 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
33720 PermuteImm, Mask, Zeroable)) {
33721 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
33722 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
33723 Shuffle = X86ISD::SHUFP;
33724 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
33725 return true;
33726 }
33727 }
33728
33729 // Attempt to combine to SHUFPS.
33730 if (AllowFloatDomain && EltSizeInBits == 32 &&
33731 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
33732 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
33733 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
33734 SmallVector<int, 4> RepeatedMask;
33735 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
33736 // Match each half of the repeated mask to determine whether it is just
33737 // referencing one of the vectors, is zeroable, or is entirely undef.
33738 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
33739 int M0 = RepeatedMask[Offset];
33740 int M1 = RepeatedMask[Offset + 1];
33741
33742 if (isUndefInRange(RepeatedMask, Offset, 2)) {
33743 return DAG.getUNDEF(MaskVT);
33744 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
33745 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
33746 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
33747 return getZeroVector(MaskVT, Subtarget, DAG, DL);
33748 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
33749 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
33750 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
33751 return V1;
33752 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
33753 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
33754 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
33755 return V2;
33756 }
33757
33758 return SDValue();
33759 };
33760
33761 int ShufMask[4] = {-1, -1, -1, -1};
33762 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
33763 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
33764
33765 if (Lo && Hi) {
33766 V1 = Lo;
33767 V2 = Hi;
33768 Shuffle = X86ISD::SHUFP;
33769 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
33770 PermuteImm = getV4X86ShuffleImm(ShufMask);
33771 return true;
33772 }
33773 }
33774 }
33775
33776 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
33777 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
33778 MaskVT.is128BitVector() &&
33779 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
33780 Shuffle = X86ISD::INSERTPS;
33781 ShuffleVT = MVT::v4f32;
33782 return true;
33783 }
33784
33785 return false;
33786}
33787
33788static SDValue combineX86ShuffleChainWithExtract(
33789 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
33790 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
33791 const X86Subtarget &Subtarget);
33792
33793/// Combine an arbitrary chain of shuffles into a single instruction if
33794/// possible.
33795///
33796/// This is the leaf of the recursive combine below. When we have found some
33797/// chain of single-use x86 shuffle instructions and accumulated the combined
33798/// shuffle mask represented by them, this will try to pattern match that mask
33799/// into either a single instruction if there is a special purpose instruction
33800/// for this operation, or into a PSHUFB instruction which is a fully general
33801/// instruction but should only be used to replace chains over a certain depth.
33802static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
33803 ArrayRef<int> BaseMask, int Depth,
33804 bool HasVariableMask,
33805 bool AllowVariableMask, SelectionDAG &DAG,
33806 const X86Subtarget &Subtarget) {
33807 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
33808 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
33809 "Unexpected number of shuffle inputs!");
33810
33811 // Find the inputs that enter the chain. Note that multiple uses are OK
33812 // here, we're not going to remove the operands we find.
33813 bool UnaryShuffle = (Inputs.size() == 1);
33814 SDValue V1 = peekThroughBitcasts(Inputs[0]);
33815 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
33816 : peekThroughBitcasts(Inputs[1]));
33817
33818 MVT VT1 = V1.getSimpleValueType();
33819 MVT VT2 = V2.getSimpleValueType();
33820 MVT RootVT = Root.getSimpleValueType();
33821 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
33822 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
33823 "Vector size mismatch");
33824
33825 SDLoc DL(Root);
33826 SDValue Res;
33827
33828 unsigned NumBaseMaskElts = BaseMask.size();
33829 if (NumBaseMaskElts == 1) {
33830 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
33831 return DAG.getBitcast(RootVT, V1);
33832 }
33833
33834 unsigned RootSizeInBits = RootVT.getSizeInBits();
33835 unsigned NumRootElts = RootVT.getVectorNumElements();
33836 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
33837 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
33838 (RootVT.isFloatingPoint() && Depth >= 1) ||
33839 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
33840
33841 // Don't combine if we are an AVX512/EVEX target and the mask element size
33842 // is different from the root element size - this would prevent writemasks
33843 // from being reused.
33844 // TODO - this currently prevents all lane shuffles from occurring.
33845 // TODO - check for writemasks usage instead of always preventing combining.
33846 // TODO - attempt to narrow Mask back to writemask size.
33847 bool IsEVEXShuffle =
33848 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
33849
33850 // Attempt to match a subvector broadcast.
33851 // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
33852 if (UnaryShuffle &&
33853 (BaseMaskEltSizeInBits == 128 || BaseMaskEltSizeInBits == 256)) {
33854 SmallVector<int, 64> BroadcastMask(NumBaseMaskElts, 0);
33855 if (isTargetShuffleEquivalent(BaseMask, BroadcastMask)) {
33856 SDValue Src = Inputs[0];
33857 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
33858 Src.getOperand(0).isUndef() &&
33859 Src.getOperand(1).getValueSizeInBits() == BaseMaskEltSizeInBits &&
33860 MayFoldLoad(Src.getOperand(1)) && isNullConstant(Src.getOperand(2))) {
33861 return DAG.getBitcast(RootVT, DAG.getNode(X86ISD::SUBV_BROADCAST, DL,
33862 Src.getValueType(),
33863 Src.getOperand(1)));
33864 }
33865 }
33866 }
33867
33868 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
33869
33870 // Handle 128-bit lane shuffles of 256-bit vectors.
33871 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
33872 // we need to use the zeroing feature.
33873 // TODO - this should support binary shuffles.
33874 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
33875 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
33876 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
33877 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
33878 return SDValue(); // Nothing to do!
33879 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
33880 unsigned PermMask = 0;
33881 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
33882 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
33883
33884 Res = DAG.getBitcast(ShuffleVT, V1);
33885 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
33886 DAG.getUNDEF(ShuffleVT),
33887 DAG.getTargetConstant(PermMask, DL, MVT::i8));
33888 return DAG.getBitcast(RootVT, Res);
33889 }
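
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It mirrors the VPERM2X128 immediate construction in the block above, assuming the same convention: each 128-bit lane gets a 4-bit selector, a negative mask index becomes 0x8 (zero the lane), and otherwise the low bit picks the source lane. encodeVPerm2X128Imm is a hypothetical helper name.

  #include <cassert>
  #include <cstdio>

  // Pack two 128-bit lane selectors into a VPERM2X128-style immediate,
  // mirroring the PermMask computation in the listing above.
  static unsigned encodeVPerm2X128Imm(int Lane0, int Lane1) {
    unsigned PermMask = 0;
    PermMask |= ((Lane0 < 0 ? 0x8u : (unsigned)(Lane0 & 1)) << 0);
    PermMask |= ((Lane1 < 0 ? 0x8u : (unsigned)(Lane1 & 1)) << 4);
    return PermMask;
  }

  int main() {
    // <1, 0> swaps the two 128-bit lanes of the input: imm = 0x01.
    assert(encodeVPerm2X128Imm(1, 0) == 0x01);
    // <-1, 1> zeroes the low lane and keeps the high lane: imm = 0x18.
    assert(encodeVPerm2X128Imm(-1, 1) == 0x18);
    std::printf("ok\n");
    return 0;
  }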
33890
33891 // For masks that have been widened to 128-bit elements or more,
33892 // narrow back down to 64-bit elements.
33893 SmallVector<int, 64> Mask;
33894 if (BaseMaskEltSizeInBits > 64) {
33895 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
33896 int MaskScale = BaseMaskEltSizeInBits / 64;
33897 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
33898 } else {
33899 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
33900 }
33901
33902 unsigned NumMaskElts = Mask.size();
33903 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
33904
33905 // Determine the effective mask value type.
33906 FloatDomain &= (32 <= MaskEltSizeInBits);
33907 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
33908 : MVT::getIntegerVT(MaskEltSizeInBits);
33909 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
33910
33911 // Only allow legal mask types.
33912 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
33913 return SDValue();
33914
33915 // Attempt to match the mask against known shuffle patterns.
33916 MVT ShuffleSrcVT, ShuffleVT;
33917 unsigned Shuffle, PermuteImm;
33918
33919 // Which shuffle domains are permitted?
33920 // Permit domain crossing at higher combine depths.
33921 // TODO: Should we indicate which domain is preferred if both are allowed?
33922 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
33923 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
33924 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
33925
33926 // Determine zeroable mask elements.
33927 APInt KnownUndef, KnownZero;
33928 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
33929 APInt Zeroable = KnownUndef | KnownZero;
33930
33931 if (UnaryShuffle) {
33932 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
33933 // directly if we don't shuffle the lower element and we shuffle the upper
33934 // (zero) elements within themselves.
33935 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
33936 (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
33937 MaskEltSizeInBits) == 0) {
33938 unsigned Scale =
33939 cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
33940 MaskEltSizeInBits;
33941 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
33942 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
33943 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
33944 return DAG.getBitcast(RootVT, V1);
33945 }
33946 }
33947
33948 // Attempt to match against broadcast-from-vector.
33949 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
33950 if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
33951 && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
33952 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
33953 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
33954 if (V1.getValueType() == MaskVT &&
33955 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
33956 MayFoldLoad(V1.getOperand(0))) {
33957 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
33958 return SDValue(); // Nothing to do!
33959 Res = V1.getOperand(0);
33960 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
33961 return DAG.getBitcast(RootVT, Res);
33962 }
33963 if (Subtarget.hasAVX2()) {
33964 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
33965 return SDValue(); // Nothing to do!
33966 Res = DAG.getBitcast(MaskVT, V1);
33967 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
33968 return DAG.getBitcast(RootVT, Res);
33969 }
33970 }
33971 }
33972
33973 SDValue NewV1 = V1; // Save operand in case early exit happens.
33974 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
33975 DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
33976 ShuffleVT) &&
33977 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
33978 if (Depth == 0 && Root.getOpcode() == Shuffle)
33979 return SDValue(); // Nothing to do!
33980 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
33981 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
33982 return DAG.getBitcast(RootVT, Res);
33983 }
33984
33985 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
33986 AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
33987 PermuteImm) &&
33988 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
33989 if (Depth == 0 && Root.getOpcode() == Shuffle)
33990 return SDValue(); // Nothing to do!
33991 Res = DAG.getBitcast(ShuffleVT, V1);
33992 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
33993 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
33994 return DAG.getBitcast(RootVT, Res);
33995 }
33996 }
33997
33998 SDValue NewV1 = V1; // Save operands in case early exit happens.
33999 SDValue NewV2 = V2;
34000 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
34001 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
34002 ShuffleVT, UnaryShuffle) &&
34003 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
34004 if (Depth == 0 && Root.getOpcode() == Shuffle)
34005 return SDValue(); // Nothing to do!
34006 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
34007 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
34008 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
34009 return DAG.getBitcast(RootVT, Res);
34010 }
34011
34012 NewV1 = V1; // Save operands in case early exit happens.
34013 NewV2 = V2;
34014 if (matchBinaryPermuteShuffle(
34015 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
34016 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
34017 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
34018 if (Depth == 0 && Root.getOpcode() == Shuffle)
34019 return SDValue(); // Nothing to do!
34020 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
34021 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
34022 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
34023 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
34024 return DAG.getBitcast(RootVT, Res);
34025 }
34026
34027 // Typically from here on, we need an integer version of MaskVT.
34028 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
34029 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
34030
34031 // Annoyingly, SSE4A instructions don't map into the above match helpers.
34032 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
34033 uint64_t BitLen, BitIdx;
34034 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
34035 Zeroable)) {
34036 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
34037 return SDValue(); // Nothing to do!
34038 V1 = DAG.getBitcast(IntMaskVT, V1);
34039 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
34040 DAG.getTargetConstant(BitLen, DL, MVT::i8),
34041 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
34042 return DAG.getBitcast(RootVT, Res);
34043 }
34044
34045 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
34046 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
34047 return SDValue(); // Nothing to do!
34048 V1 = DAG.getBitcast(IntMaskVT, V1);
34049 V2 = DAG.getBitcast(IntMaskVT, V2);
34050 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
34051 DAG.getTargetConstant(BitLen, DL, MVT::i8),
34052 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
34053 return DAG.getBitcast(RootVT, Res);
34054 }
34055 }
34056
34057 // Don't try to re-form single instruction chains under any circumstances now
34058 // that we've done encoding canonicalization for them.
34059 if (Depth < 1)
34060 return SDValue();
34061
34062 // Depth threshold above which we can efficiently use variable mask shuffles.
34063 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
34064 AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
34065
34066 bool MaskContainsZeros =
34067 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
34068
34069 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
34070 // If we have a single input lane-crossing shuffle then lower to VPERMV.
34071 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
34072 ((Subtarget.hasAVX2() &&
34073 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34074 (Subtarget.hasAVX512() &&
34075 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34076 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34077 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34078 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
34079 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34080 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
34081 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34082 Res = DAG.getBitcast(MaskVT, V1);
34083 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
34084 return DAG.getBitcast(RootVT, Res);
34085 }
34086
34087 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
34088 // vector as the second source.
34089 if (UnaryShuffle && AllowVariableMask &&
34090 ((Subtarget.hasAVX512() &&
34091 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34092 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34093 (Subtarget.hasVLX() &&
34094 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
34095 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34096 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34097 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
34098 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34099 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
34100 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
34101 for (unsigned i = 0; i != NumMaskElts; ++i)
34102 if (Mask[i] == SM_SentinelZero)
34103 Mask[i] = NumMaskElts + i;
34104
34105 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34106 Res = DAG.getBitcast(MaskVT, V1);
34107 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
34108 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
34109 return DAG.getBitcast(RootVT, Res);
34110 }
34111
34112 // If that failed and either input is extracted then try to combine as a
34113 // shuffle with the larger type.
34114 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
34115 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
34116 DAG, Subtarget))
34117 return WideShuffle;
34118
34119 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
34120 if (AllowVariableMask && !MaskContainsZeros &&
34121 ((Subtarget.hasAVX512() &&
34122 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34123 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34124 (Subtarget.hasVLX() &&
34125 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
34126 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34127 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34128 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
34129 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34130 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
34131 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34132 V1 = DAG.getBitcast(MaskVT, V1);
34133 V2 = DAG.getBitcast(MaskVT, V2);
34134 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
34135 return DAG.getBitcast(RootVT, Res);
34136 }
34137 return SDValue();
34138 }
34139
34140 // See if we can combine a single input shuffle with zeros to a bit-mask,
34141 // which is much simpler than any shuffle.
34142 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
34143 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
34144 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
34145 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
34146 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
34147 APInt UndefElts(NumMaskElts, 0);
34148 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
34149 for (unsigned i = 0; i != NumMaskElts; ++i) {
34150 int M = Mask[i];
34151 if (M == SM_SentinelUndef) {
34152 UndefElts.setBit(i);
34153 continue;
34154 }
34155 if (M == SM_SentinelZero)
34156 continue;
34157 EltBits[i] = AllOnes;
34158 }
34159 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
34160 Res = DAG.getBitcast(MaskVT, V1);
34161 unsigned AndOpcode =
34162 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
34163 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
34164 return DAG.getBitcast(RootVT, Res);
34165 }
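
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It shows the bit-mask fallback above for 32-bit mask elements: lanes that stay in place get an all-ones constant, zeroed lanes stay zero, and the shuffle collapses to a single AND. buildAndMask is a hypothetical stand-in for the getConstVector-based construction in the real code, and it does not track undef lanes separately the way UndefElts does.

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Sentinel matching the shuffle-mask convention used in the listing.
  constexpr int SM_SentinelZero = -2;

  // Build a per-element AND mask for a shuffle that only keeps or zeroes
  // lanes in place (mask element i is either i, undef, or zero).
  static std::vector<uint32_t> buildAndMask(const std::vector<int> &Mask) {
    std::vector<uint32_t> Bits(Mask.size(), 0);
    for (size_t i = 0; i != Mask.size(); ++i) {
      if (Mask[i] == SM_SentinelZero)
        continue;              // zeroed lane stays all-zero
      // Kept (or undef) lanes get all-ones; any value is acceptable for undef.
      Bits[i] = 0xFFFFFFFFu;
    }
    return Bits;
  }

  int main() {
    // shuffle <0, zero, 2, zero> of a v4i32 becomes AND with <-1, 0, -1, 0>.
    std::vector<int> Mask = {0, SM_SentinelZero, 2, SM_SentinelZero};
    for (uint32_t B : buildAndMask(Mask))
      std::printf("0x%08x ", (unsigned)B);
    std::printf("\n");
    return 0;
  }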
34166
34167 // If we have a single input shuffle with different shuffle patterns in
34168 // the 128-bit lanes, use the variable mask to VPERMILPS.
34169 // TODO Combine other mask types at higher depths.
34170 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
34171 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
34172 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
34173 SmallVector<SDValue, 16> VPermIdx;
34174 for (int M : Mask) {
34175 SDValue Idx =
34176 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
34177 VPermIdx.push_back(Idx);
34178 }
34179 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
34180 Res = DAG.getBitcast(MaskVT, V1);
34181 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
34182 return DAG.getBitcast(RootVT, Res);
34183 }
34184
34185 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
34186 // to VPERMIL2PD/VPERMIL2PS.
34187 if (AllowVariableMask && Subtarget.hasXOP() &&
34188 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
34189 MaskVT == MVT::v8f32)) {
34190 // VPERMIL2 Operation.
34191 // Bits[3] - Match Bit.
34192 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
34193 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
34194 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
34195 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
34196 SmallVector<int, 8> VPerm2Idx;
34197 unsigned M2ZImm = 0;
34198 for (int M : Mask) {
34199 if (M == SM_SentinelUndef) {
34200 VPerm2Idx.push_back(-1);
34201 continue;
34202 }
34203 if (M == SM_SentinelZero) {
34204 M2ZImm = 2;
34205 VPerm2Idx.push_back(8);
34206 continue;
34207 }
34208 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
34209 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
34210 VPerm2Idx.push_back(Index);
34211 }
34212 V1 = DAG.getBitcast(MaskVT, V1);
34213 V2 = DAG.getBitcast(MaskVT, V2);
34214 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
34215 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
34216 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
34217 return DAG.getBitcast(RootVT, Res);
34218 }
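
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It reproduces the per-element VPERMIL2 selector math from the loop above: a lane-relative index, doubled for 64-bit elements, with the value 8 and an M2Z immediate of 2 used for zeroed elements, following the bit layout described in the comment above. encodeVPermil2 is a hypothetical helper name.

  #include <cstdio>
  #include <vector>

  constexpr int SM_SentinelUndef = -1;
  constexpr int SM_SentinelZero  = -2;

  // Mirror of the per-element selector computation in the listing above.
  static void encodeVPermil2(const std::vector<int> &Mask, int NumMaskElts,
                             int NumEltsPerLane, unsigned ScalarBits,
                             std::vector<int> &Idx, unsigned &M2ZImm) {
    M2ZImm = 0;
    for (int M : Mask) {
      if (M == SM_SentinelUndef) { Idx.push_back(-1); continue; }
      if (M == SM_SentinelZero)  { M2ZImm = 2; Idx.push_back(8); continue; }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
      Idx.push_back(ScalarBits == 64 ? Index << 1 : Index);
    }
  }

  int main() {
    // v4f32 binary shuffle <0, 5, zero, 7>: elements 5 and 7 come from the
    // second source, so their selectors land in the 4..7 range.
    std::vector<int> Mask = {0, 5, SM_SentinelZero, 7};
    std::vector<int> Idx;
    unsigned M2ZImm = 0;
    encodeVPermil2(Mask, 4, 4, 32, Idx, M2ZImm);
    for (int I : Idx) std::printf("%d ", I);
    std::printf("| M2Z=%u\n", M2ZImm);  // prints: 0 5 8 7 | M2Z=2
    return 0;
  }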
34219
34220 // If we have 3 or more shuffle instructions or a chain involving a variable
34221 // mask, we can replace them with a single PSHUFB instruction profitably.
34222 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
34223 // instructions, but in practice PSHUFB tends to be *very* fast so we're
34224 // more aggressive.
34225 if (UnaryShuffle && AllowVariableMask &&
34226 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
34227 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
34228 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
34229 SmallVector<SDValue, 16> PSHUFBMask;
34230 int NumBytes = RootVT.getSizeInBits() / 8;
34231 int Ratio = NumBytes / NumMaskElts;
34232 for (int i = 0; i < NumBytes; ++i) {
34233 int M = Mask[i / Ratio];
34234 if (M == SM_SentinelUndef) {
34235 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
34236 continue;
34237 }
34238 if (M == SM_SentinelZero) {
34239 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
34240 continue;
34241 }
34242 M = Ratio * M + i % Ratio;
34243 assert((M / 16) == (i / 16) && "Lane crossing detected");
34244 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
34245 }
34246 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
34247 Res = DAG.getBitcast(ByteVT, V1);
34248 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
34249 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
34250 return DAG.getBitcast(RootVT, Res);
34251 }
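
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It expands an element-level mask to PSHUFB byte selectors the way the loop above does: selectors are scaled by the byte ratio, 255 (top bit set) requests a zero byte, and indices must stay within their 128-bit lane, as checked by the assert at line 34243. toPSHUFBBytes is a hypothetical helper name; undef bytes are kept as -1 here instead of DAG.getUNDEF.

  #include <cassert>
  #include <cstdio>
  #include <vector>

  constexpr int SM_SentinelUndef = -1;
  constexpr int SM_SentinelZero  = -2;

  // Expand an element-level shuffle mask to PSHUFB byte selectors.
  static std::vector<int> toPSHUFBBytes(const std::vector<int> &Mask,
                                        int NumBytes) {
    int Ratio = NumBytes / (int)Mask.size();   // bytes per mask element
    std::vector<int> Bytes;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) { Bytes.push_back(-1); continue; }
      if (M == SM_SentinelZero)  { Bytes.push_back(255); continue; }
      M = Ratio * M + i % Ratio;
      assert((M / 16) == (i / 16) && "Lane crossing detected");
      Bytes.push_back(M);
    }
    return Bytes;
  }

  int main() {
    // v4i32 shuffle <2, zero, 0, 1> on a 16-byte vector:
    // 8 9 10 11  255 255 255 255  0 1 2 3  4 5 6 7
    for (int B : toPSHUFBBytes({2, SM_SentinelZero, 0, 1}, 16))
      std::printf("%d ", B);
    std::printf("\n");
    return 0;
  }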
34252
34253 // With XOP, if we have a 128-bit binary input shuffle we can always combine
34254 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
34255 // slower than PSHUFB on targets that support both.
34256 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
34257 // VPPERM Mask Operation
34258 // Bits[4:0] - Byte Index (0 - 31)
34259 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
34260 SmallVector<SDValue, 16> VPPERMMask;
34261 int NumBytes = 16;
34262 int Ratio = NumBytes / NumMaskElts;
34263 for (int i = 0; i < NumBytes; ++i) {
34264 int M = Mask[i / Ratio];
34265 if (M == SM_SentinelUndef) {
34266 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
34267 continue;
34268 }
34269 if (M == SM_SentinelZero) {
34270 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
34271 continue;
34272 }
34273 M = Ratio * M + i % Ratio;
34274 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
34275 }
34276 MVT ByteVT = MVT::v16i8;
34277 V1 = DAG.getBitcast(ByteVT, V1);
34278 V2 = DAG.getBitcast(ByteVT, V2);
34279 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
34280 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
34281 return DAG.getBitcast(RootVT, Res);
34282 }
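
The following standalone check is illustrative only and not part of X86ISelLowering.cpp. It decodes the VPPERM selector encoding described in the comment above: bits [4:0] index one of the 32 concatenated source bytes and bits [7:5] select the operation, so the value 128 pushed for zeroable elements decodes to operation 4 (write zero).

  #include <cstdio>

  int main() {
    unsigned ZeroSel = 128;  // selector used above for SM_SentinelZero
    std::printf("op=%u index=%u\n", (ZeroSel >> 5) & 7, ZeroSel & 31);  // op=4 index=0
    unsigned ByteSel = 19;   // byte 3 of the second 16-byte source
    std::printf("op=%u index=%u\n", (ByteSel >> 5) & 7, ByteSel & 31);  // op=0 index=19
    return 0;
  }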
34283
34284 // If that failed and either input is extracted then try to combine as a
34285 // shuffle with the larger type.
34286 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
34287 Inputs, Root, BaseMask, Depth, HasVariableMask, AllowVariableMask,
34288 DAG, Subtarget))
34289 return WideShuffle;
34290
34291 // If we have a dual input shuffle then lower to VPERMV3.
34292 if (!UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
34293 ((Subtarget.hasAVX512() &&
34294 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
34295 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
34296 (Subtarget.hasVLX() &&
34297 (MaskVT == MVT::v2f64 || MaskVT == MVT::v2i64 || MaskVT == MVT::v4f64 ||
34298 MaskVT == MVT::v4i64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
34299 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
34300 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
34301 (Subtarget.hasBWI() && Subtarget.hasVLX() &&
34302 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16)) ||
34303 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
34304 (Subtarget.hasVBMI() && Subtarget.hasVLX() &&
34305 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8)))) {
34306 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
34307 V1 = DAG.getBitcast(MaskVT, V1);
34308 V2 = DAG.getBitcast(MaskVT, V2);
34309 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
34310 return DAG.getBitcast(RootVT, Res);
34311 }
34312
34313 // Failed to find any combines.
34314 return SDValue();
34315}
34316
34317// Combine an arbitrary chain of shuffles + extract_subvectors into a single
34318// instruction if possible.
34319//
34320// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
34321// type size to attempt to combine:
34322// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
34323// -->
34324// extract_subvector(shuffle(x,y,m2),0)
34325static SDValue combineX86ShuffleChainWithExtract(
34326 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
34327 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
34328 const X86Subtarget &Subtarget) {
34329 unsigned NumMaskElts = BaseMask.size();
34330 unsigned NumInputs = Inputs.size();
34331 if (NumInputs == 0)
34332 return SDValue();
34333
34334 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
34335 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
34336
34337 // Peek through subvectors.
34338 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
34339 unsigned WideSizeInBits = WideInputs[0].getValueSizeInBits();
34340 for (unsigned i = 0; i != NumInputs; ++i) {
34341 SDValue &Src = WideInputs[i];
34342 unsigned &Offset = Offsets[i];
34343 Src = peekThroughBitcasts(Src);
34344 EVT BaseVT = Src.getValueType();
34345 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
34346 isa<ConstantSDNode>(Src.getOperand(1))) {
34347 Offset += Src.getConstantOperandVal(1);
34348 Src = Src.getOperand(0);
34349 }
34350 WideSizeInBits = std::max(WideSizeInBits,
34351 (unsigned)Src.getValueSizeInBits());
34352 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
34353 "Unexpected subvector extraction");
34354 Offset /= BaseVT.getVectorNumElements();
34355 Offset *= NumMaskElts;
34356 }
34357
34358 // Bail if we're always extracting from the lowest subvectors,
34359 // combineX86ShuffleChain should match this for the current width.
34360 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
34361 return SDValue();
34362
34363 EVT RootVT = Root.getValueType();
34364 unsigned RootSizeInBits = RootVT.getSizeInBits();
34365 unsigned Scale = WideSizeInBits / RootSizeInBits;
34366 assert((WideSizeInBits % RootSizeInBits) == 0 &&
34367 "Unexpected subvector extraction");
34368
34369 // If the src vector types aren't the same, see if we can extend
34370 // them to match each other.
34371 // TODO: Support different scalar types?
34372 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
34373 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
34374 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
34375 Op.getValueType().getScalarType() != WideSVT;
34376 }))
34377 return SDValue();
34378
34379 for (SDValue &NewInput : WideInputs) {
34380 assert((WideSizeInBits % NewInput.getValueSizeInBits()) == 0 &&
34381 "Shuffle vector size mismatch");
34382 if (WideSizeInBits > NewInput.getValueSizeInBits())
34383 NewInput = widenSubVector(NewInput, false, Subtarget, DAG,
34384 SDLoc(NewInput), WideSizeInBits);
34385 assert(WideSizeInBits == NewInput.getValueSizeInBits() &&
34386 "Unexpected subvector extraction");
34387 }
34388
34389 // Create new mask for larger type.
34390 for (unsigned i = 1; i != NumInputs; ++i)
34391 Offsets[i] += i * Scale * NumMaskElts;
34392
34393 SmallVector<int, 64> WideMask(BaseMask.begin(), BaseMask.end());
34394 for (int &M : WideMask) {
34395 if (M < 0)
34396 continue;
34397 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
34398 }
34399 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
34400
34401 // Remove unused/repeated shuffle source ops.
34402 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
34403 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
34404
34405 if (WideInputs.size() > 2)
34406 return SDValue();
34407
34408 // Increase depth for every upper subvector we've peeked through.
34409 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
34410
34411 // Attempt to combine wider chain.
34412 // TODO: Can we use a better Root?
34413 SDValue WideRoot = WideInputs[0];
34414 if (SDValue WideShuffle = combineX86ShuffleChain(
34415 WideInputs, WideRoot, WideMask, Depth, HasVariableMask,
34416 AllowVariableMask, DAG, Subtarget)) {
34417 WideShuffle =
34418 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
34419 return DAG.getBitcast(RootVT, WideShuffle);
34420 }
34421 return SDValue();
34422}
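
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It shows the mask-widening step of combineX86ShuffleChainWithExtract, assuming the per-input Offsets have already been scaled to mask-element units and shifted into the concatenated index space, as the function does before building WideMask. widenMask is a hypothetical helper name.

  #include <cstdio>
  #include <vector>

  constexpr int SM_SentinelUndef = -1;

  // Remap a narrow shuffle mask onto wider source vectors and pad the unused
  // upper lanes with undef, mirroring the WideMask construction above.
  static std::vector<int> widenMask(const std::vector<int> &BaseMask,
                                    const std::vector<int> &Offsets,
                                    int Scale) {
    int NumMaskElts = (int)BaseMask.size();
    std::vector<int> WideMask(BaseMask);
    for (int &M : WideMask) {
      if (M < 0)
        continue;
      M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
    }
    WideMask.insert(WideMask.end(), (Scale - 1) * NumMaskElts, SM_SentinelUndef);
    return WideMask;
  }

  int main() {
    // shuffle(extract_subvector(x, 4), extract_subvector(y, 0), <0,1,4,5>)
    // with v4 extracts from v8 sources (Scale = 2): input 0 starts at wide
    // element 4, input 1 starts at 8 in the concatenated index space.
    for (int M : widenMask({0, 1, 4, 5}, {4, 8}, 2))
      std::printf("%d ", M);
    std::printf("\n");  // prints: 4 5 8 9 -1 -1 -1 -1
    return 0;
  }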
34423
34424// Attempt to constant fold all of the constant source ops.
34425// Returns true if the entire shuffle is folded to a constant.
34426// TODO: Extend this to merge multiple constant Ops and update the mask.
34427static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
34428 ArrayRef<int> Mask, SDValue Root,
34429 bool HasVariableMask,
34430 SelectionDAG &DAG,
34431 const X86Subtarget &Subtarget) {
34432 MVT VT = Root.getSimpleValueType();
34433
34434 unsigned SizeInBits = VT.getSizeInBits();
34435 unsigned NumMaskElts = Mask.size();
34436 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
34437 unsigned NumOps = Ops.size();
34438
34439 // Extract constant bits from each source op.
34440 bool OneUseConstantOp = false;
34441 SmallVector<APInt, 16> UndefEltsOps(NumOps);
34442 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
34443 for (unsigned i = 0; i != NumOps; ++i) {
34444 SDValue SrcOp = Ops[i];
34445 OneUseConstantOp |= SrcOp.hasOneUse();
34446 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
34447 RawBitsOps[i]))
34448 return SDValue();
34449 }
34450
34451 // Only fold if at least one of the constants has a single use or the
34452 // combined shuffle has included a variable mask shuffle; this avoids
34453 // constant pool bloat.
34454 if (!OneUseConstantOp && !HasVariableMask)
34455 return SDValue();
34456
34457 // Shuffle the constant bits according to the mask.
34458 APInt UndefElts(NumMaskElts, 0);
34459 APInt ZeroElts(NumMaskElts, 0);
34460 APInt ConstantElts(NumMaskElts, 0);
34461 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
34462 APInt::getNullValue(MaskSizeInBits));
34463 for (unsigned i = 0; i != NumMaskElts; ++i) {
34464 int M = Mask[i];
34465 if (M == SM_SentinelUndef) {
34466 UndefElts.setBit(i);
34467 continue;
34468 } else if (M == SM_SentinelZero) {
34469 ZeroElts.setBit(i);
34470 continue;
34471 }
34472 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
34473
34474 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
34475 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
34476
34477 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
34478 if (SrcUndefElts[SrcMaskIdx]) {
34479 UndefElts.setBit(i);
34480 continue;
34481 }
34482
34483 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
34484 APInt &Bits = SrcEltBits[SrcMaskIdx];
34485 if (!Bits) {
34486 ZeroElts.setBit(i);
34487 continue;
34488 }
34489
34490 ConstantElts.setBit(i);
34491 ConstantBitData[i] = Bits;
34492 }
34493 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
34494
34495 // Create the constant data.
34496 MVT MaskSVT;
34497 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
34498 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
34499 else
34500 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
34501
34502 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
34503
34504 SDLoc DL(Root);
34505 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
34506 return DAG.getBitcast(VT, CstOp);
34507}
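
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It shows the indexing used by combineX86ShufflesConstants: mask value M selects constant input M / NumMaskElts and element M % NumMaskElts, while undef and zero lanes are tracked separately. foldShuffleOfConstants and ConstLane are hypothetical simplifications that use plain 64-bit values instead of APInt and skip the per-input undef tracking.

  #include <cassert>
  #include <cstdio>
  #include <vector>

  constexpr int SM_SentinelUndef = -1;
  constexpr int SM_SentinelZero  = -2;

  struct ConstLane { bool Undef; unsigned long long Bits; };

  // Shuffle per-element constants through a mask, mirroring the folding loop.
  static std::vector<ConstLane>
  foldShuffleOfConstants(const std::vector<int> &Mask,
                         const std::vector<std::vector<unsigned long long>> &Ops) {
    size_t NumMaskElts = Mask.size();
    std::vector<ConstLane> Out(NumMaskElts, {false, 0});
    for (size_t i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) { Out[i].Undef = true; continue; }
      if (M == SM_SentinelZero)  { continue; }        // stays zero
      assert(0 <= M && M < (int)(NumMaskElts * Ops.size()));
      size_t SrcOpIdx   = (size_t)M / NumMaskElts;    // which constant input
      size_t SrcMaskIdx = (size_t)M % NumMaskElts;    // which element of it
      Out[i].Bits = Ops[SrcOpIdx][SrcMaskIdx];
    }
    return Out;
  }

  int main() {
    // Binary v4 shuffle <0, 5, zero, undef> of {1,2,3,4} and {5,6,7,8}: 1 6 0 u
    auto Out = foldShuffleOfConstants({0, 5, SM_SentinelZero, SM_SentinelUndef},
                                      {{1, 2, 3, 4}, {5, 6, 7, 8}});
    for (const ConstLane &L : Out)
      std::printf(L.Undef ? "u " : "%llu ", L.Bits);
    std::printf("\n");
    return 0;
  }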
34508
34509/// Fully generic combining of x86 shuffle instructions.
34510///
34511/// This should be the last combine run over the x86 shuffle instructions. Once
34512/// they have been fully optimized, this will recursively consider all chains
34513/// of single-use shuffle instructions, build a generic model of the cumulative
34514/// shuffle operation, and check for simpler instructions which implement this
34515/// operation. We use this primarily for two purposes:
34516///
34517/// 1) Collapse generic shuffles to specialized single instructions when
34518/// equivalent. In most cases, this is just an encoding size win, but
34519/// sometimes we will collapse multiple generic shuffles into a single
34520/// special-purpose shuffle.
34521/// 2) Look for sequences of shuffle instructions with 3 or more total
34522/// instructions, and replace them with the slightly more expensive SSSE3
34523/// PSHUFB instruction if available. We do this as the last combining step
34524/// to ensure we avoid using PSHUFB if we can implement the shuffle with
34525/// a suitable short sequence of other instructions. The PSHUFB will either
34526/// use a register or have to read from memory and so is slightly (but only
34527/// slightly) more expensive than the other shuffle instructions.
34528///
34529/// Because this is inherently a quadratic operation (for each shuffle in
34530/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
34531/// This should never be an issue in practice as the shuffle lowering doesn't
34532/// produce sequences of more than 8 instructions.
34533///
34534/// FIXME: We will currently miss some cases where the redundant shuffling
34535/// would simplify under the threshold for PSHUFB formation because of
34536/// combine-ordering. To fix this, we should do the redundant instruction
34537/// combining in this recursive walk.
34538static SDValue combineX86ShufflesRecursively(
34539 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
34540 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
34541 bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
34542 const X86Subtarget &Subtarget) {
34543 assert(RootMask.size() > 0 &&
34544 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
34545 "Illegal shuffle root mask");
1. Assuming the condition is true
2. Assuming the condition is true
3. '?' condition is true
34546
34547 // Bound the depth of our recursive combine because this is ultimately
34548 // quadratic in nature.
34549 const unsigned MaxRecursionDepth = 8;
34550 if (Depth >= MaxRecursionDepth)
4. Assuming 'Depth' is < 'MaxRecursionDepth'
5. Taking false branch
34551 return SDValue();
34552
34553 // Directly rip through bitcasts to find the underlying operand.
34554 SDValue Op = SrcOps[SrcOpIndex];
34555 Op = peekThroughOneUseBitcasts(Op);
34556
34557 MVT VT = Op.getSimpleValueType();
34558 if (!VT.isVector())
6. Calling 'MVT::isVector'
10. Returning from 'MVT::isVector'
11. Taking false branch
34559 return SDValue(); // Bail if we hit a non-vector.
34560
34561 assert(Root.getSimpleValueType().isVector() &&
34562 "Shuffles operate on vector types!");
12. '?' condition is true
34563 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
34564 "Can only combine shuffles of the same vector register size.");
13. '?' condition is true
34565
34566 // Extract target shuffle mask and resolve sentinels and inputs.
34567 // TODO - determine Op's demanded elts from RootMask.
34568 SmallVector<int, 64> OpMask;
34569 SmallVector<SDValue, 2> OpInputs;
34570 APInt OpUndef, OpZero;
34571 APInt OpDemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
34572 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
34573 if (!getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
34574 OpZero, DAG, Depth, false))
14. Calling 'getTargetShuffleInputs'
31. Returning from 'getTargetShuffleInputs'
32. Taking false branch
34575 return SDValue();
34576
34577 // Shuffle inputs must be the same size as the result.
34578 if (llvm::any_of(OpInputs, [VT](SDValue Op) {
33. Calling 'any_of<llvm::SmallVector<llvm::SDValue, 2> &, (lambda at X86ISelLowering.cpp:34578:30)>'
41. Returning from 'any_of<llvm::SmallVector<llvm::SDValue, 2> &, (lambda at X86ISelLowering.cpp:34578:30)>'
42. Taking false branch
34579 return VT.getSizeInBits() != Op.getValueSizeInBits();
34580 }))
34581 return SDValue();
34582
34583 SmallVector<int, 64> Mask;
34584 SmallVector<SDValue, 16> Ops;
34585
34586 // We don't need to merge masks if the root is empty.
34587 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
43. Assuming 'Depth' is not equal to 0
34588 if (EmptyRoot) {
43.1. 'EmptyRoot' is false
44. Taking false branch
34589 // Only resolve zeros if it will remove an input, otherwise we might end
34590 // up in an infinite loop.
34591 bool ResolveKnownZeros = true;
34592 if (!OpZero.isNullValue()) {
34593 APInt UsedInputs = APInt::getNullValue(OpInputs.size());
34594 for (int i = 0, e = OpMask.size(); i != e; ++i) {
34595 int M = OpMask[i];
34596 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
34597 continue;
34598 UsedInputs.setBit(M / OpMask.size());
34599 if (UsedInputs.isAllOnesValue()) {
34600 ResolveKnownZeros = false;
34601 break;
34602 }
34603 }
34604 }
34605 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
34606 ResolveKnownZeros);
34607
34608 Mask = OpMask;
34609 Ops.append(OpInputs.begin(), OpInputs.end());
34610 } else {
34611 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
45. Calling 'resolveTargetShuffleFromZeroables'
52. Returning from 'resolveTargetShuffleFromZeroables'
34612
34613 // Add the inputs to the Ops list, avoiding duplicates.
34614 Ops.append(SrcOps.begin(), SrcOps.end());
34615
34616 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
34617 // Attempt to find an existing match.
34618 SDValue InputBC = peekThroughBitcasts(Input);
34619 for (int i = 0, e = Ops.size(); i < e; ++i)
34620 if (InputBC == peekThroughBitcasts(Ops[i]))
34621 return i;
34622 // Match failed - should we replace an existing Op?
34623 if (InsertionPoint >= 0) {
34624 Ops[InsertionPoint] = Input;
34625 return InsertionPoint;
34626 }
34627 // Add to the end of the Ops list.
34628 Ops.push_back(Input);
34629 return Ops.size() - 1;
34630 };
34631
34632 SmallVector<int, 2> OpInputIdx;
34633 for (SDValue OpInput : OpInputs)
53. Assuming '__begin2' is equal to '__end2'
34634 OpInputIdx.push_back(
34635 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
34636
34637 assert(((RootMask.size() > OpMask.size() &&
34638 RootMask.size() % OpMask.size() == 0) ||
34639 (OpMask.size() > RootMask.size() &&
34640 OpMask.size() % RootMask.size() == 0) ||
34641 OpMask.size() == RootMask.size()) &&
34642 "The smaller number of elements must divide the larger.");
54. Assuming the condition is true
55. Calling 'SmallVectorBase::size'
57. Returning from 'SmallVectorBase::size'
58. Division by zero
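
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. On the reported path the shuffle decomposition produces no inputs (step 53 above), the analyzer treats OpMask.size() as zero, and the assert condition then evaluates RootMask.size() % OpMask.size() at line 34637, which is the division by zero named in the bug summary. The sketch only illustrates one possible shape of a guard, bailing out before the ratio logic when either mask is empty; it is not the actual fix adopted in LLVM.

  #include <cstdio>
  #include <vector>

  // Hedged sketch of a defensive check: refuse to merge when the peeked
  // operand produced no mask elements, instead of evaluating
  // RootMask.size() % OpMask.size() with a zero divisor.
  static bool canMergeMasks(const std::vector<int> &RootMask,
                            const std::vector<int> &OpMask) {
    if (OpMask.empty() || RootMask.empty())
      return false;  // nothing to merge; avoids the zero divisor
    return (RootMask.size() > OpMask.size() &&
            RootMask.size() % OpMask.size() == 0) ||
           (OpMask.size() > RootMask.size() &&
            OpMask.size() % RootMask.size() == 0) ||
           OpMask.size() == RootMask.size();
  }

  int main() {
    std::vector<int> Root = {0, 1, 2, 3};
    // The empty-op-mask case is rejected instead of dividing by zero.
    std::printf("%d %d\n", canMergeMasks(Root, {}), canMergeMasks(Root, {0, 1}));
    return 0;
  }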
34643
34644 // This function can be performance-critical, so we rely on the power-of-2
34645 // knowledge that we have about the mask sizes to replace div/rem ops with
34646 // bit-masks and shifts.
34647 assert(isPowerOf2_32(RootMask.size()) &&
34648 "Non-power-of-2 shuffle mask sizes");
34649 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
34650 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
34651 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
34652
34653 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
34654 unsigned RootRatio =
34655 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
34656 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
34657 assert((RootRatio == 1 || OpRatio == 1) &&
34658 "Must not have a ratio for both incoming and op masks!");
34659
34660 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
34661 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
34662 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
34663 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
34664 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
34665
34666 Mask.resize(MaskWidth, SM_SentinelUndef);
34667
34668 // Merge this shuffle operation's mask into our accumulated mask. Note that
34669 // this shuffle's mask will be the first applied to the input, followed by
34670 // the root mask to get us all the way to the root value arrangement. The
34671 // reason for this order is that we are recursing up the operation chain.
34672 for (unsigned i = 0; i < MaskWidth; ++i) {
34673 unsigned RootIdx = i >> RootRatioLog2;
34674 if (RootMask[RootIdx] < 0) {
34675 // This is a zero or undef lane, we're done.
34676 Mask[i] = RootMask[RootIdx];
34677 continue;
34678 }
34679
34680 unsigned RootMaskedIdx =
34681 RootRatio == 1
34682 ? RootMask[RootIdx]
34683 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
34684
34685 // Just insert the scaled root mask value if it references an input other
34686 // than the SrcOp we're currently inserting.
34687 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
34688 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
34689 Mask[i] = RootMaskedIdx;
34690 continue;
34691 }
34692
34693 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
34694 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
34695 if (OpMask[OpIdx] < 0) {
34696 // The incoming lanes are zero or undef, it doesn't matter which ones we
34697 // are using.
34698 Mask[i] = OpMask[OpIdx];
34699 continue;
34700 }
34701
34702 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
34703 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
34704 : (OpMask[OpIdx] << OpRatioLog2) +
34705 (RootMaskedIdx & (OpRatio - 1));
34706
34707 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
34708 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
34709 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
34710 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
34711
34712 Mask[i] = OpMaskedIdx;
34713 }
34714 }
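
The following standalone sketch is illustrative only and not part of X86ISelLowering.cpp. It gives a numeric example of the merge loop above for the single-input case where the op mask is finer-grained than the root mask (RootRatio > 1, OpRatio == 1). mergeMasks is a hypothetical simplification that drops the multi-input bookkeeping (OpInputIdx, the SrcOpIndex range checks) and keeps only the index arithmetic.

  #include <cstdio>
  #include <vector>

  constexpr int SM_SentinelUndef = -1;

  // Expand RootMask to the op mask width, then map each index through OpMask.
  // Assumes both sizes are powers of two and OpMask is at least as wide.
  static std::vector<int> mergeMasks(const std::vector<int> &RootMask,
                                     const std::vector<int> &OpMask) {
    unsigned MaskWidth = (unsigned)OpMask.size();
    unsigned RootRatio = MaskWidth / (unsigned)RootMask.size();
    std::vector<int> Mask(MaskWidth, SM_SentinelUndef);
    for (unsigned i = 0; i < MaskWidth; ++i) {
      int RootM = RootMask[i / RootRatio];
      if (RootM < 0) { Mask[i] = RootM; continue; }       // zero/undef lane
      unsigned RootMaskedIdx = RootM * RootRatio + (i % RootRatio);
      Mask[i] = OpMask[RootMaskedIdx];                    // OpRatio == 1 here
    }
    return Mask;
  }

  int main() {
    // Root shuffle <1, 0> of 64-bit halves on top of an op shuffle
    // <2, 3, 0, 1> of 32-bit elements: the merged 32-bit mask is <0, 1, 2, 3>.
    for (int M : mergeMasks({1, 0}, {2, 3, 0, 1}))
      std::printf("%d ", M);
    std::printf("\n");
    return 0;
  }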
34715
34716 // Remove unused/repeated shuffle source ops.
34717 resolveTargetShuffleInputsAndMask(Ops, Mask);
34718
34719 // Handle the all undef/zero cases early.
34720 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
34721 return DAG.getUNDEF(Root.getValueType());
34722
34723 // TODO - should we handle the mixed zero/undef case as well? Just returning
34724 // a zero mask will lose information on undef elements possibly reducing
34725 // future combine possibilities.
34726 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
34727 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
34728 SDLoc(Root));
34729
34730 assert(!Ops.empty() && "Shuffle with no inputs detected");
34731 HasVariableMask |= IsOpVariableMask;
34732
34733 // Update the list of shuffle nodes that have been combined so far.
34734 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
34735 SrcNodes.end());
34736 CombinedNodes.push_back(Op.getNode());
34737
34738 // See if we can recurse into each shuffle source op (if it's a target
34739 // shuffle). The source op should only be generally combined if it either has
34740 // a single use (i.e. the current Op) or all its users have already been combined;
34741 // if not, we can still combine but should prevent generation of variable
34742 // shuffles to avoid constant pool bloat.
34743 // Don't recurse if we already have more source ops than we can combine in
34744 // the remaining recursion depth.
34745 if (Ops.size() < (MaxRecursionDepth - Depth)) {
34746 for (int i = 0, e = Ops.size(); i < e; ++i) {
34747 // For empty roots, we need to resolve zeroable elements before combining
34748 // them with other shuffles.
34749 SmallVector<int, 64> ResolvedMask = Mask;
34750 if (EmptyRoot)
34751 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
34752 bool AllowVar = false;
34753 if (Ops[i].getNode()->hasOneUse() ||
34754 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
34755 AllowVar = AllowVariableMask;
34756 if (SDValue Res = combineX86ShufflesRecursively(
34757 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
34758 HasVariableMask, AllowVar, DAG, Subtarget))
34759 return Res;
34760 }
34761 }
34762
34763 // Attempt to constant fold all of the constant source ops.
34764 if (SDValue Cst = combineX86ShufflesConstants(
34765 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
34766 return Cst;
34767
34768 // We can only combine unary and binary shuffle mask cases.
34769 if (Ops.size() <= 2) {
34770 // Minor canonicalization of the accumulated shuffle mask to make it easier
34771 // to match below. All this does is detect masks with sequential pairs of
34772 // elements, and shrink them to the half-width mask. It does this in a loop
34773 // so it will reduce the size of the mask to the minimal width mask which
34774 // performs an equivalent shuffle.
34775 SmallVector<int, 64> WidenedMask;
34776 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
34777 Mask = std::move(WidenedMask);
34778 }
34779
34780 // Canonicalization of binary shuffle masks to improve pattern matching by
34781 // commuting the inputs.
34782 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
34783 ShuffleVectorSDNode::commuteMask(Mask);
34784 std::swap(Ops[0], Ops[1]);
34785 }
34786
34787 // Finally, try to combine into a single shuffle instruction.
34788 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
34789 AllowVariableMask, DAG, Subtarget);
34790 }
34791
34792 // If that failed and any input is extracted then try to combine as a
34793 // shuffle with the larger type.
34794 return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
34795 HasVariableMask, AllowVariableMask,
34796 DAG, Subtarget);
34797}
34798
34799/// Helper entry wrapper to combineX86ShufflesRecursively.
34800static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
34801 const X86Subtarget &Subtarget) {
34802 return combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 0,
34803 /*HasVarMask*/ false,
34804 /*AllowVarMask*/ true, DAG, Subtarget);
34805}
34806
34807/// Get the PSHUF-style mask from PSHUF node.
34808///
34809 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
34810/// PSHUF-style masks that can be reused with such instructions.
34811static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
34812 MVT VT = N.getSimpleValueType();
34813 SmallVector<int, 4> Mask;
34814 SmallVector<SDValue, 2> Ops;
34815 bool IsUnary;
34816 bool HaveMask =
34817 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
34818 (void)HaveMask;
34819 assert(HaveMask);
34820
34821 // If we have more than 128-bits, only the low 128-bits of shuffle mask
34822 // matter. Check that the upper masks are repeats and remove them.
34823 if (VT.getSizeInBits() > 128) {
34824 int LaneElts = 128 / VT.getScalarSizeInBits();
34825#ifndef NDEBUG
34826 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
34827 for (int j = 0; j < LaneElts; ++j)
34828 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
34829 "Mask doesn't repeat in high 128-bit lanes!");
34830#endif
34831 Mask.resize(LaneElts);
34832 }
34833
34834 switch (N.getOpcode()) {
34835 case X86ISD::PSHUFD:
34836 return Mask;
34837 case X86ISD::PSHUFLW:
34838 Mask.resize(4);
34839 return Mask;
34840 case X86ISD::PSHUFHW:
34841 Mask.erase(Mask.begin(), Mask.begin() + 4);
34842 for (int &M : Mask)
34843 M -= 4;
34844 return Mask;
34845 default:
34846 llvm_unreachable("No valid shuffle instruction found!");
34847 }
34848}
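// Illustrative example (hypothetical immediate): a v8i16 PSHUFHW with control
// byte 0x1B produces the full mask {0,1,2,3,7,6,5,4}; dropping the identity low
// half and rebasing by 4 yields the returned 4-element mask {3,2,1,0}.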
34849
34850/// Search for a combinable shuffle across a chain ending in pshufd.
34851///
34852/// We walk up the chain and look for a combinable shuffle, skipping over
34853/// shuffles that we could hoist this shuffle's transformation past without
34854/// altering anything.
34855static SDValue
34856combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
34857 SelectionDAG &DAG) {
34858 assert(N.getOpcode() == X86ISD::PSHUFD &&
34859 "Called with something other than an x86 128-bit half shuffle!");
34860 SDLoc DL(N);
34861
34862 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
34863 // of the shuffles in the chain so that we can form a fresh chain to replace
34864 // this one.
34865 SmallVector<SDValue, 8> Chain;
34866 SDValue V = N.getOperand(0);
34867 for (; V.hasOneUse(); V = V.getOperand(0)) {
34868 switch (V.getOpcode()) {
34869 default:
34870 return SDValue(); // Nothing combined!
34871
34872 case ISD::BITCAST:
34873 // Skip bitcasts as we always know the type for the target specific
34874 // instructions.
34875 continue;
34876
34877 case X86ISD::PSHUFD:
34878 // Found another dword shuffle.
34879 break;
34880
34881 case X86ISD::PSHUFLW:
34882 // Check that the low words (being shuffled) are the identity in the
34883 // dword shuffle, and the high words are self-contained.
34884 if (Mask[0] != 0 || Mask[1] != 1 ||
34885 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
34886 return SDValue();
34887
34888 Chain.push_back(V);
34889 continue;
34890
34891 case X86ISD::PSHUFHW:
34892 // Check that the high words (being shuffled) are the identity in the
34893 // dword shuffle, and the low words are self-contained.
34894 if (Mask[2] != 2 || Mask[3] != 3 ||
34895 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
34896 return SDValue();
34897
34898 Chain.push_back(V);
34899 continue;
34900
34901 case X86ISD::UNPCKL:
34902 case X86ISD::UNPCKH:
34903 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
34904 // shuffle into a preceding word shuffle.
34905 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
34906 V.getSimpleValueType().getVectorElementType() != MVT::i16)
34907 return SDValue();
34908
34909 // Search for a half-shuffle which we can combine with.
34910 unsigned CombineOp =
34911 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
34912 if (V.getOperand(0) != V.getOperand(1) ||
34913 !V->isOnlyUserOf(V.getOperand(0).getNode()))
34914 return SDValue();
34915 Chain.push_back(V);
34916 V = V.getOperand(0);
34917 do {
34918 switch (V.getOpcode()) {
34919 default:
34920 return SDValue(); // Nothing to combine.
34921
34922 case X86ISD::PSHUFLW:
34923 case X86ISD::PSHUFHW:
34924 if (V.getOpcode() == CombineOp)
34925 break;
34926
34927 Chain.push_back(V);
34928
34929 LLVM_FALLTHROUGH;
34930 case ISD::BITCAST:
34931 V = V.getOperand(0);
34932 continue;
34933 }
34934 break;
34935 } while (V.hasOneUse());
34936 break;
34937 }
34938 // Break out of the loop if we break out of the switch.
34939 break;
34940 }
34941
34942 if (!V.hasOneUse())
34943 // We fell out of the loop without finding a viable combining instruction.
34944 return SDValue();
34945
34946 // Merge this node's mask and our incoming mask.
34947 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
34948 for (int &M : Mask)
34949 M = VMask[M];
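// Illustrative example (hypothetical masks): if the incoming Mask is {2,3,0,1}
// and the inner shuffle's VMask is {1,0,3,2}, the composed mask is
// {VMask[2],VMask[3],VMask[0],VMask[1]} = {3,2,1,0}, i.e. a single PSHUFD that
// selects directly from V's input.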
34950 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
34951 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
34952
34953 // Rebuild the chain around this new shuffle.
34954 while (!Chain.empty()) {
34955 SDValue W = Chain.pop_back_val();
34956
34957 if (V.getValueType() != W.getOperand(0).getValueType())
34958 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
34959
34960 switch (W.getOpcode()) {
34961 default:
34962 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
34963
34964 case X86ISD::UNPCKL:
34965 case X86ISD::UNPCKH:
34966 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
34967 break;
34968
34969 case X86ISD::PSHUFD:
34970 case X86ISD::PSHUFLW:
34971 case X86ISD::PSHUFHW:
34972 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
34973 break;
34974 }
34975 }
34976 if (V.getValueType() != N.getValueType())
34977 V = DAG.getBitcast(N.getValueType(), V);
34978
34979 // Return the new chain to replace N.
34980 return V;
34981}
34982
34983// Attempt to commute shufps LHS loads:
34984// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
34985static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
34986 SelectionDAG &DAG) {
34987 // TODO: Add vXf64 support.
34988 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
34989 return SDValue();
34990
34991 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
34992 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
34993 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
34994 return SDValue();
34995 SDValue N0 = V.getOperand(0);
34996 SDValue N1 = V.getOperand(1);
34997 unsigned Imm = V.getConstantOperandVal(2);
34998 if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
34999 MayFoldLoad(peekThroughOneUseBitcasts(N1)))
35000 return SDValue();
35001 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
35002 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
35003 DAG.getTargetConstant(Imm, DL, MVT::i8));
35004 };
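// Illustrative example (hypothetical immediate): commuting the SHUFP operands
// swaps which source feeds the low and high pairs of each result lane, so the
// two 4-bit halves of its control byte are exchanged, e.g. 0xB1 -> 0x1B. The
// callers below then XOR the outer immediate with 0xAA, 0x0A or 0xA0 so each
// affected 2-bit selector has its high bit flipped, retargeting elements 0/1 to
// 2/3 and vice versa.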
35005
35006 switch (N.getOpcode()) {
35007 case X86ISD::VPERMILPI:
35008 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
35009 unsigned Imm = N.getConstantOperandVal(1);
35010 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
35011 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
35012 }
35013 break;
35014 case X86ISD::SHUFP: {
35015 SDValue N0 = N.getOperand(0);
35016 SDValue N1 = N.getOperand(1);
35017 unsigned Imm = N.getConstantOperandVal(2);
35018 if (N0 == N1) {
35019 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
35020 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
35021 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
35022 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
35023 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
35024 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
35025 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
35026 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
35027 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
35028 }
35029 break;
35030 }
35031 }
35032
35033 return SDValue();
35034}
35035
35036/// Try to combine x86 target specific shuffles.
35037static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
35038 TargetLowering::DAGCombinerInfo &DCI,
35039 const X86Subtarget &Subtarget) {
35040 SDLoc DL(N);
35041 MVT VT = N.getSimpleValueType();
35042 SmallVector<int, 4> Mask;
35043 unsigned Opcode = N.getOpcode();
35044
35045 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
35046 // single instruction.
35047 if (VT.getScalarSizeInBits() == 64 &&
35048 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
35049 Opcode == X86ISD::UNPCKL)) {
35050 auto BC0 = peekThroughBitcasts(N.getOperand(0));
35051 auto BC1 = peekThroughBitcasts(N.getOperand(1));
35052 EVT VT0 = BC0.getValueType();
35053 EVT VT1 = BC1.getValueType();
35054 unsigned Opcode0 = BC0.getOpcode();
35055 unsigned Opcode1 = BC1.getOpcode();
35056 if (Opcode0 == Opcode1 && VT0 == VT1 &&
35057 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
35058 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
35059 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
35060 SDValue Lo, Hi;
35061 if (Opcode == X86ISD::MOVSD) {
35062 Lo = BC1.getOperand(0);
35063 Hi = BC0.getOperand(1);
35064 } else {
35065 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
35066 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
35067 }
35068 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
35069 return DAG.getBitcast(VT, Horiz);
35070 }
35071 }
35072
35073 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
35074 return R;
35075
35076 switch (Opcode) {
35077 case X86ISD::MOVDDUP: {
35078 SDValue Src = N.getOperand(0);
35079 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
35080 if (VT == MVT::v2f64 && Src.hasOneUse() &&
35081 ISD::isNormalLoad(Src.getNode())) {
35082 LoadSDNode *LN = cast<LoadSDNode>(Src);
35083 // Unless the load is volatile or atomic.
35084 if (LN->isSimple()) {
35085 SDVTList Tys = DAG.getVTList(MVT::v2f64, MVT::Other);
35086 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35087 SDValue VZLoad =
35088 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::f64,
35089 LN->getPointerInfo(),
35090 LN->getAlignment(),
35091 LN->getMemOperand()->getFlags());
35092 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
35093 DCI.CombineTo(N.getNode(), Movddup);
35094 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
35095 DCI.recursivelyDeleteUnusedNodes(LN);
35096 return N; // Return N so it doesn't get rechecked!
35097 }
35098 }
35099
35100 return SDValue();
35101 }
35102 case X86ISD::VBROADCAST: {
35103 SDValue Src = N.getOperand(0);
35104 SDValue BC = peekThroughBitcasts(Src);
35105 EVT SrcVT = Src.getValueType();
35106 EVT BCVT = BC.getValueType();
35107
35108 // If broadcasting from another shuffle, attempt to simplify it.
35109 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
35110 if (isTargetShuffle(BC.getOpcode()) &&
35111 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
35112 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
35113 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
35114 SM_SentinelUndef);
35115 for (unsigned i = 0; i != Scale; ++i)
35116 DemandedMask[i] = i;
35117 if (SDValue Res = combineX86ShufflesRecursively(
35118 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
35119 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
35120 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
35121 DAG.getBitcast(SrcVT, Res));
35122 }
35123
35124 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
35125 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
35126 if (Src.getOpcode() == ISD::BITCAST &&
35127 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
35128 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
35129 VT.getVectorNumElements());
35130 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
35131 }
35132
35133 // Reduce broadcast source vector to lowest 128-bits.
35134 if (SrcVT.getSizeInBits() > 128)
35135 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
35136 extract128BitVector(Src, 0, DAG, DL));
35137
35138 // broadcast(scalar_to_vector(x)) -> broadcast(x).
35139 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
35140 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
35141
35142 // Share broadcast with the longest vector and extract low subvector (free).
35143 for (SDNode *User : Src->uses())
35144 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
35145 User->getValueSizeInBits(0) > VT.getSizeInBits()) {
35146 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
35147 VT.getSizeInBits());
35148 }
35149
35150 // vbroadcast(scalarload X) -> vbroadcast_load X
35151 // For float loads, extract other uses of the scalar from the broadcast.
35152 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
35153 ISD::isNormalLoad(Src.getNode())) {
35154 LoadSDNode *LN = cast<LoadSDNode>(Src);
35155 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35156 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35157 SDValue BcastLd =
35158 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35159 LN->getMemoryVT(), LN->getMemOperand());
35160 // If the load value is used only by N, replace it via CombineTo N.
35161 bool NoReplaceExtract = Src.hasOneUse();
35162 DCI.CombineTo(N.getNode(), BcastLd);
35163 if (NoReplaceExtract) {
35164 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35165 DCI.recursivelyDeleteUnusedNodes(LN);
35166 } else {
35167 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
35168 DAG.getIntPtrConstant(0, DL));
35169 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
35170 }
35171 return N; // Return N so it doesn't get rechecked!
35172 }
35173
35174 // vbroadcast(vzload X) -> vbroadcast_load X
35175 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
35176 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
35177 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
35178 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35179 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35180 SDValue BcastLd =
35181 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35182 LN->getMemoryVT(), LN->getMemOperand());
35183 DCI.CombineTo(N.getNode(), BcastLd);
35184 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35185 DCI.recursivelyDeleteUnusedNodes(LN);
35186 return N; // Return N so it doesn't get rechecked!
35187 }
35188 }
35189
35190 // vbroadcast(vector load X) -> vbroadcast_load
35191 if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
35192 ISD::isNormalLoad(Src.getNode())) {
35193 LoadSDNode *LN = cast<LoadSDNode>(Src);
35194 // Unless the load is volatile or atomic.
35195 if (LN->isSimple()) {
35196 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35197 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35198 SDValue BcastLd =
35199 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
35200 MVT::f64, LN->getPointerInfo(),
35201 LN->getAlignment(),
35202 LN->getMemOperand()->getFlags());
35203 DCI.CombineTo(N.getNode(), BcastLd);
35204 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
35205 DCI.recursivelyDeleteUnusedNodes(LN);
35206 return N; // Return N so it doesn't get rechecked!
35207 }
35208 }
35209
35210 return SDValue();
35211 }
35212 case X86ISD::BLENDI: {
35213 SDValue N0 = N.getOperand(0);
35214 SDValue N1 = N.getOperand(1);
35215
35216 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
35217 // TODO: Handle MVT::v16i16 repeated blend mask.
35218 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
35219 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
35220 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
35221 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
35222 SrcVT.getScalarSizeInBits() >= 32) {
35223 unsigned BlendMask = N.getConstantOperandVal(2);
35224 unsigned Size = VT.getVectorNumElements();
35225 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
35226 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
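// Illustrative example (hypothetical types): a v4i64 blend with mask 0b0101
// over operands bitcast from v8i32 (Scale = 2) becomes the v8i32 blend mask
// 0b00110011, i.e. each original mask bit is replicated Scale times.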
35227 return DAG.getBitcast(
35228 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
35229 N1.getOperand(0),
35230 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
35231 }
35232 }
35233 return SDValue();
35234 }
35235 case X86ISD::VPERMI: {
35236 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
35237 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
35238 SDValue N0 = N.getOperand(0);
35239 SDValue N1 = N.getOperand(1);
35240 unsigned EltSizeInBits = VT.getScalarSizeInBits();
35241 if (N0.getOpcode() == ISD::BITCAST &&
35242 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
35243 SDValue Src = N0.getOperand(0);
35244 EVT SrcVT = Src.getValueType();
35245 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
35246 return DAG.getBitcast(VT, Res);
35247 }
35248 return SDValue();
35249 }
35250 case X86ISD::VPERM2X128: {
35251 // If both 128-bit values were inserted into high halves of 256-bit values,
35252 // the shuffle can be reduced to a concatenation of subvectors:
35253 // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
35254 // Note: We are only looking for the exact high/high shuffle mask because we
35255 // expect to fold other similar patterns before creating this opcode.
35256 SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
35257 SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
35258 unsigned Imm = N.getConstantOperandVal(2);
35259 if (!(Imm == 0x31 &&
35260 Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
35261 Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
35262 Ins0.getValueType() == Ins1.getValueType() &&
35263 isa<ConstantSDNode>(Ins0.getOperand(2)) &&
35264 isa<ConstantSDNode>(Ins1.getOperand(2))))
35265 return SDValue();
35266
35267 SDValue X = Ins0.getOperand(1);
35268 SDValue Y = Ins1.getOperand(1);
35269 unsigned C1 = Ins0.getConstantOperandVal(2);
35270 unsigned C2 = Ins1.getConstantOperandVal(2);
35271 MVT SrcVT = X.getSimpleValueType();
35272 unsigned SrcElts = SrcVT.getVectorNumElements();
35273 if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
35274 C1 != SrcElts || C2 != SrcElts)
35275 return SDValue();
35276
35277 return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
35278 Ins1.getValueType(), X, Y));
35279 }
35280 case X86ISD::PSHUFD:
35281 case X86ISD::PSHUFLW:
35282 case X86ISD::PSHUFHW:
35283 Mask = getPSHUFShuffleMask(N);
35284 assert(Mask.size() == 4);
35285 break;
35286 case X86ISD::MOVSD:
35287 case X86ISD::MOVSS: {
35288 SDValue N0 = N.getOperand(0);
35289 SDValue N1 = N.getOperand(1);
35290
35291 // Canonicalize scalar FPOps:
35292 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
35293 // If commutable, allow OP(N1[0], N0[0]).
35294 unsigned Opcode1 = N1.getOpcode();
35295 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
35296 Opcode1 == ISD::FDIV) {
35297 SDValue N10 = N1.getOperand(0);
35298 SDValue N11 = N1.getOperand(1);
35299 if (N10 == N0 ||
35300 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
35301 if (N10 != N0)
35302 std::swap(N10, N11);
35303 MVT SVT = VT.getVectorElementType();
35304 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
35305 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
35306 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
35307 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
35308 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
35309 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
35310 }
35311 }
35312
35313 return SDValue();
35314 }
35315 case X86ISD::INSERTPS: {
35316 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
35317 SDValue Op0 = N.getOperand(0);
35318 SDValue Op1 = N.getOperand(1);
35319 SDValue Op2 = N.getOperand(2);
35320 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
35321 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
35322 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
35323 unsigned ZeroMask = InsertPSMask & 0xF;
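// Illustrative example (hypothetical immediate): InsertPSMask 0x9C decodes as
// SrcIdx = 2 (bits 7:6), DstIdx = 1 (bits 5:4) and ZeroMask = 0b1100, i.e. take
// element 2 of Op1, insert it into lane 1 of Op0, and zero lanes 2 and 3.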
35324
35325 // If we zero out all elements from Op0 then we don't need to reference it.
35326 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
35327 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
35328 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35329
35330 // If we zero out the element from Op1 then we don't need to reference it.
35331 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
35332 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
35333 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35334
35335 // Attempt to merge insertps Op1 with an inner target shuffle node.
35336 SmallVector<int, 8> TargetMask1;
35337 SmallVector<SDValue, 2> Ops1;
35338 APInt KnownUndef1, KnownZero1;
35339 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
35340 KnownZero1)) {
35341 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
35342 // Zero/UNDEF insertion - zero out element and remove dependency.
35343 InsertPSMask |= (1u << DstIdx);
35344 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
35345 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35346 }
35347 // Update insertps mask srcidx and reference the source input directly.
35348 int M = TargetMask1[SrcIdx];
35349 assert(0 <= M && M < 8 && "Shuffle index out of range");
35350 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
35351 Op1 = Ops1[M < 4 ? 0 : 1];
35352 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
35353 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35354 }
35355
35356 // Attempt to merge insertps Op0 with an inner target shuffle node.
35357 SmallVector<int, 8> TargetMask0;
35358 SmallVector<SDValue, 2> Ops0;
35359 APInt KnownUndef0, KnownZero0;
35360 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
35361 KnownZero0)) {
35362 bool Updated = false;
35363 bool UseInput00 = false;
35364 bool UseInput01 = false;
35365 for (int i = 0; i != 4; ++i) {
35366 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
35367 // No change if element is already zero or the inserted element.
35368 continue;
35369 } else if (KnownUndef0[i] || KnownZero0[i]) {
35370 // If the target mask is undef/zero then we must zero the element.
35371 InsertPSMask |= (1u << i);
35372 Updated = true;
35373 continue;
35374 }
35375
35376 // The input vector element must be inline.
35377 int M = TargetMask0[i];
35378 if (M != i && M != (i + 4))
35379 return SDValue();
35380
35381 // Determine which inputs of the target shuffle we're using.
35382 UseInput00 |= (0 <= M && M < 4);
35383 UseInput01 |= (4 <= M);
35384 }
35385
35386 // If we're not using both inputs of the target shuffle then use the
35387 // referenced input directly.
35388 if (UseInput00 && !UseInput01) {
35389 Updated = true;
35390 Op0 = Ops0[0];
35391 } else if (!UseInput00 && UseInput01) {
35392 Updated = true;
35393 Op0 = Ops0[1];
35394 }
35395
35396 if (Updated)
35397 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
35398 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
35399 }
35400
35401 // If we're inserting an element from a vbroadcast load, fold the
35402 // load into the X86insertps instruction. We need to convert the scalar
35403 // load to a vector and clear the source lane of the INSERTPS control.
35404 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
35405 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
35406 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
35407 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
35408 MemIntr->getBasePtr(),
35409 MemIntr->getMemOperand());
35410 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
35411 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
35412 Load),
35413 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
35414 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
35415 return Insert;
35416 }
35417 }
35418
35419 return SDValue();
35420 }
35421 default:
35422 return SDValue();
35423 }
35424
35425 // Nuke no-op shuffles that show up after combining.
35426 if (isNoopShuffleMask(Mask))
35427 return N.getOperand(0);
35428
35429 // Look for simplifications involving one or two shuffle instructions.
35430 SDValue V = N.getOperand(0);
35431 switch (N.getOpcode()) {
35432 default:
35433 break;
35434 case X86ISD::PSHUFLW:
35435 case X86ISD::PSHUFHW:
35436 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
35437
35438 // See if this reduces to a PSHUFD which is no more expensive and can
35439 // combine with more operations. Note that it has to at least flip the
35440 // dwords as otherwise it would have been removed as a no-op.
35441 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
35442 int DMask[] = {0, 1, 2, 3};
35443 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
35444 DMask[DOffset + 0] = DOffset + 1;
35445 DMask[DOffset + 1] = DOffset + 0;
35446 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
35447 V = DAG.getBitcast(DVT, V);
35448 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
35449 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
35450 return DAG.getBitcast(VT, V);
35451 }
35452
35453 // Look for shuffle patterns which can be implemented as a single unpack.
35454 // FIXME: This doesn't handle the location of the PSHUFD generically, and
35455 // only works when we have a PSHUFD followed by two half-shuffles.
35456 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
35457 (V.getOpcode() == X86ISD::PSHUFLW ||
35458 V.getOpcode() == X86ISD::PSHUFHW) &&
35459 V.getOpcode() != N.getOpcode() &&
35460 V.hasOneUse()) {
35461 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
35462 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
35463 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
35464 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
35465 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
35466 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
35467 int WordMask[8];
35468 for (int i = 0; i < 4; ++i) {
35469 WordMask[i + NOffset] = Mask[i] + NOffset;
35470 WordMask[i + VOffset] = VMask[i] + VOffset;
35471 }
35472 // Map the word mask through the DWord mask.
35473 int MappedMask[8];
35474 for (int i = 0; i < 8; ++i)
35475 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
35476 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
35477 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
35478 // We can replace all three shuffles with an unpack.
35479 V = DAG.getBitcast(VT, D.getOperand(0));
35480 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
35481 : X86ISD::UNPCKH,
35482 DL, VT, V, V);
35483 }
35484 }
35485 }
35486
35487 break;
35488
35489 case X86ISD::PSHUFD:
35490 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
35491 return NewN;
35492
35493 break;
35494 }
35495
35496 return SDValue();
35497}
35498
35499/// Checks if the shuffle mask takes subsequent elements
35500/// alternately from two vectors.
35501/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
35502static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
35503
35504 int ParitySrc[2] = {-1, -1};
35505 unsigned Size = Mask.size();
35506 for (unsigned i = 0; i != Size; ++i) {
35507 int M = Mask[i];
35508 if (M < 0)
35509 continue;
35510
35511 // Make sure we are using the matching element from the input.
35512 if ((M % Size) != i)
35513 return false;
35514
35515 // Make sure we use the same input for all elements of the same parity.
35516 int Src = M / Size;
35517 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
35518 return false;
35519 ParitySrc[i % 2] = Src;
35520 }
35521
35522 // Make sure each input is used.
35523 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
35524 return false;
35525
35526 Op0Even = ParitySrc[0] == 0;
35527 return true;
35528}
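// Illustrative example (hypothetical mask): for <0,5,2,7> over two 4-element
// sources, every element i takes lane i of some source (M % Size == i), the even
// positions all read source 0 and the odd positions all read source 1, so the
// mask qualifies and Op0Even is set to true.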
35529
35530/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
35531/// operation. If true is returned then the operands of the ADDSUB(SUBADD)
35532/// operation are written to the parameters \p Opnd0 and \p Opnd1.
35533///
35534/// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
35535/// nodes so they are easier to match generically. We also insert dummy vector
35536/// shuffle nodes for the operands which explicitly discard the lanes that are
35537/// unused by this operation, so that the fact that they're unused can flow
35538/// through the rest of the combiner.
35539static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
35540 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
35541 bool &IsSubAdd) {
35542
35543 EVT VT = N->getValueType(0);
35544 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35545 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
35546 !VT.getSimpleVT().isFloatingPoint())
35547 return false;
35548
35549 // We only handle target-independent shuffles.
35550 // FIXME: It would be easy and harmless to use the target shuffle mask
35551 // extraction tool to support more.
35552 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
35553 return false;
35554
35555 SDValue V1 = N->getOperand(0);
35556 SDValue V2 = N->getOperand(1);
35557
35558 // Make sure we have an FADD and an FSUB.
35559 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
35560 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
35561 V1.getOpcode() == V2.getOpcode())
35562 return false;
35563
35564 // If there are other uses of these operations we can't fold them.
35565 if (!V1->hasOneUse() || !V2->hasOneUse())
35566 return false;
35567
35568 // Ensure that both operations have the same operands. Note that we can
35569 // commute the FADD operands.
35570 SDValue LHS, RHS;
35571 if (V1.getOpcode() == ISD::FSUB) {
35572 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
35573 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
35574 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
35575 return false;
35576 } else {
35577 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
35578 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
35579 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
35580 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
35581 return false;
35582 }
35583
35584 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
35585 bool Op0Even;
35586 if (!isAddSubOrSubAddMask(Mask, Op0Even))
35587 return false;
35588
35589 // It's a subadd if the vector in the even parity is an FADD.
35590 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
35591 : V2->getOpcode() == ISD::FADD;
35592
35593 Opnd0 = LHS;
35594 Opnd1 = RHS;
35595 return true;
35596}
35597
35598/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
35599static SDValue combineShuffleToFMAddSub(SDNode *N,
35600 const X86Subtarget &Subtarget,
35601 SelectionDAG &DAG) {
35602 // We only handle target-independent shuffles.
35603 // FIXME: It would be easy and harmless to use the target shuffle mask
35604 // extraction tool to support more.
35605 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
35606 return SDValue();
35607
35608 MVT VT = N->getSimpleValueType(0);
35609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35610 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
35611 return SDValue();
35612
35613 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
35614 SDValue Op0 = N->getOperand(0);
35615 SDValue Op1 = N->getOperand(1);
35616 SDValue FMAdd = Op0, FMSub = Op1;
35617 if (FMSub.getOpcode() != X86ISD::FMSUB)
35618 std::swap(FMAdd, FMSub);
35619
35620 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
35621 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
35622 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
35623 FMAdd.getOperand(2) != FMSub.getOperand(2))
35624 return SDValue();
35625
35626 // Check for correct shuffle mask.
35627 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
35628 bool Op0Even;
35629 if (!isAddSubOrSubAddMask(Mask, Op0Even))
35630 return SDValue();
35631
35632 // FMAddSub takes zeroth operand from FMSub node.
35633 SDLoc DL(N);
35634 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
35635 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
35636 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
35637 FMAdd.getOperand(2));
35638}
35639
35640/// Try to combine a shuffle into a target-specific add-sub or
35641/// mul-add-sub node.
35642static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
35643 const X86Subtarget &Subtarget,
35644 SelectionDAG &DAG) {
35645 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
35646 return V;
35647
35648 SDValue Opnd0, Opnd1;
35649 bool IsSubAdd;
35650 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
35651 return SDValue();
35652
35653 MVT VT = N->getSimpleValueType(0);
35654 SDLoc DL(N);
35655
35656 // Try to generate X86ISD::FMADDSUB node here.
35657 SDValue Opnd2;
35658 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
35659 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
35660 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
35661 }
35662
35663 if (IsSubAdd)
35664 return SDValue();
35665
35666 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
35667 // the ADDSUB idiom has been successfully recognized. There are no known
35668 // X86 targets with 512-bit ADDSUB instructions!
35669 if (VT.is512BitVector())
35670 return SDValue();
35671
35672 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
35673}
35674
35675// We are looking for a shuffle where both sources are concatenated with undef
35676// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
35677// if we can express this as a single-source shuffle, that's preferable.
35678static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
35679 const X86Subtarget &Subtarget) {
35680 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
35681 return SDValue();
35682
35683 EVT VT = N->getValueType(0);
35684
35685 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
35686 if (!VT.is128BitVector() && !VT.is256BitVector())
35687 return SDValue();
35688
35689 if (VT.getVectorElementType() != MVT::i32 &&
35690 VT.getVectorElementType() != MVT::i64 &&
35691 VT.getVectorElementType() != MVT::f32 &&
35692 VT.getVectorElementType() != MVT::f64)
35693 return SDValue();
35694
35695 SDValue N0 = N->getOperand(0);
35696 SDValue N1 = N->getOperand(1);
35697
35698 // Check that both sources are concats with undef.
35699 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
35700 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
35701 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
35702 !N1.getOperand(1).isUndef())
35703 return SDValue();
35704
35705 // Construct the new shuffle mask. Elements from the first source retain their
35706 // index, but elements from the second source no longer need to skip an undef.
35707 SmallVector<int, 8> Mask;
35708 int NumElts = VT.getVectorNumElements();
35709
35710 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
35711 for (int Elt : SVOp->getMask())
35712 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
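// Illustrative example (hypothetical mask): for v8i32 sources concat(t1, undef)
// and concat(t2, undef), the mask <0,1,8,9,2,3,10,11> becomes <0,1,4,5,2,3,6,7>,
// which indexes straight into the single concat(t1, t2) built below.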
35713
35714 SDLoc DL(N);
35715 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
35716 N1.getOperand(0));
35717 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
35718}
35719
35720/// Eliminate a redundant shuffle of a horizontal math op.
35721static SDValue foldShuffleOfHorizOp(SDNode *N, SelectionDAG &DAG) {
35722 unsigned Opcode = N->getOpcode();
35723 if (Opcode != X86ISD::MOVDDUP && Opcode != X86ISD::VBROADCAST)
35724 if (Opcode != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
35725 return SDValue();
35726
35727 // For a broadcast, peek through an extract element of index 0 to find the
35728 // horizontal op: broadcast (ext_vec_elt HOp, 0)
35729 EVT VT = N->getValueType(0);
35730 if (Opcode == X86ISD::VBROADCAST) {
35731 SDValue SrcOp = N->getOperand(0);
35732 if (SrcOp.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35733 SrcOp.getValueType() == MVT::f64 &&
35734 SrcOp.getOperand(0).getValueType() == VT &&
35735 isNullConstant(SrcOp.getOperand(1)))
35736 N = SrcOp.getNode();
35737 }
35738
35739 SDValue HOp = N->getOperand(0);
35740 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
35741 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
35742 return SDValue();
35743
35744 // 128-bit horizontal math instructions are defined to operate on adjacent
35745 // lanes of each operand as:
35746 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
35747 // ...similarly for v2f64 and v8i16.
35748 if (!HOp.getOperand(0).isUndef() && !HOp.getOperand(1).isUndef() &&
35749 HOp.getOperand(0) != HOp.getOperand(1))
35750 return SDValue();
35751
35752 // The shuffle that we are eliminating may have allowed the horizontal op to
35753 // have an undemanded (undefined) operand. Duplicate the other (defined)
35754 // operand to ensure that the results are defined across all lanes without the
35755 // shuffle.
35756 auto updateHOp = [](SDValue HorizOp, SelectionDAG &DAG) {
35757 SDValue X;
35758 if (HorizOp.getOperand(0).isUndef()) {
35759 assert(!HorizOp.getOperand(1).isUndef() && "Not expecting foldable h-op");
35760 X = HorizOp.getOperand(1);
35761 } else if (HorizOp.getOperand(1).isUndef()) {
35762 assert(!HorizOp.getOperand(0).isUndef() && "Not expecting foldable h-op");
35763 X = HorizOp.getOperand(0);
35764 } else {
35765 return HorizOp;
35766 }
35767 return DAG.getNode(HorizOp.getOpcode(), SDLoc(HorizOp),
35768 HorizOp.getValueType(), X, X);
35769 };
35770
35771 // When the operands of a horizontal math op are identical, the low half of
35772 // the result is the same as the high half. If a target shuffle is also
35773 // replicating low and high halves (and without changing the type/length of
35774 // the vector), we don't need the shuffle.
35775 if (Opcode == X86ISD::MOVDDUP || Opcode == X86ISD::VBROADCAST) {
35776 if (HOp.getScalarValueSizeInBits() == 64 && HOp.getValueType() == VT) {
35777 // movddup (hadd X, X) --> hadd X, X
35778 // broadcast (extract_vec_elt (hadd X, X), 0) --> hadd X, X
35779 assert((HOp.getValueType() == MVT::v2f64 ||
35780 HOp.getValueType() == MVT::v4f64) && "Unexpected type for h-op");
35781 return updateHOp(HOp, DAG);
35782 }
35783 return SDValue();
35784 }
35785
35786 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
35787 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
35788 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
35789 // but this should be tied to whatever horizontal op matching and shuffle
35790 // canonicalization are producing.
35791 if (HOp.getValueSizeInBits() == 128 &&
35792 (isTargetShuffleEquivalent(Mask, {0, 0}) ||
35793 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
35794 isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
35795 return updateHOp(HOp, DAG);
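// Illustrative example: for v4f32 HADD X, X the result is
// {X[0]+X[1], X[2]+X[3], X[0]+X[1], X[2]+X[3]}, so a shuffle that merely
// replicates the low half (e.g. mask {0,1,0,1}) changes nothing and is dropped.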
35796
35797 if (HOp.getValueSizeInBits() == 256 &&
35798 (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
35799 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
35800 isTargetShuffleEquivalent(
35801 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
35802 return updateHOp(HOp, DAG);
35803
35804 return SDValue();
35805}
35806
35807/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
35808/// low half of each source vector and does not set any high half elements in
35809/// the destination vector, narrow the shuffle to half its original size.
35810static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
35811 if (!Shuf->getValueType(0).isSimple())
35812 return SDValue();
35813 MVT VT = Shuf->getSimpleValueType(0);
35814 if (!VT.is256BitVector() && !VT.is512BitVector())
35815 return SDValue();
35816
35817 // See if we can ignore all of the high elements of the shuffle.
35818 ArrayRef<int> Mask = Shuf->getMask();
35819 if (!isUndefUpperHalf(Mask))
35820 return SDValue();
35821
35822 // Check if the shuffle mask accesses only the low half of each input vector
35823 // (half-index output is 0 or 2).
35824 int HalfIdx1, HalfIdx2;
35825 SmallVector<int, 8> HalfMask(Mask.size() / 2);
35826 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
35827 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
35828 return SDValue();
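// Illustrative example (hypothetical mask): a v8f64 shuffle with mask
// <0,8,1,9,-1,-1,-1,-1> has an undef upper half and reads only the low halves of
// both inputs, so it maps to HalfMask <0,4,1,5> with HalfIdx1 = 0 and
// HalfIdx2 = 2, both even, and can be rebuilt as a v4f64 shuffle below.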
35829
35830 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
35831 // The trick is knowing that all of the insert/extract are actually free
35832 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
35833 // of narrow inputs into a narrow output, and that is always cheaper than
35834 // the wide shuffle that we started with.
35835 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
35836 Shuf->getOperand(1), HalfMask, HalfIdx1,
35837 HalfIdx2, false, DAG, /*UseConcat*/true);
35838}
35839
35840static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
35841 TargetLowering::DAGCombinerInfo &DCI,
35842 const X86Subtarget &Subtarget) {
35843 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
35844 if (SDValue V = narrowShuffle(Shuf, DAG))
35845 return V;
35846
35847 // If we have legalized the vector types, look for blends of FADD and FSUB
35848 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
35849 SDLoc dl(N);
35850 EVT VT = N->getValueType(0);
35851 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35852 if (TLI.isTypeLegal(VT)) {
35853 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
35854 return AddSub;
35855
35856 if (SDValue HAddSub = foldShuffleOfHorizOp(N, DAG))
35857 return HAddSub;
35858 }
35859
35860 // Attempt to combine into a vector load/broadcast.
35861 if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
35862 return LD;
35863
35864 // For AVX2, we sometimes want to combine
35865 // (vector_shuffle <mask> (concat_vectors t1, undef)
35866 // (concat_vectors t2, undef))
35867 // Into:
35868 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
35869 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
35870 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
35871 return ShufConcat;
35872
35873 if (isTargetShuffle(N->getOpcode())) {
35874 SDValue Op(N, 0);
35875 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
35876 return Shuffle;
35877
35878 // Try recursively combining arbitrary sequences of x86 shuffle
35879 // instructions into higher-order shuffles. We do this after combining
35880 // specific PSHUF instruction sequences into their minimal form so that we
35881 // can evaluate how many specialized shuffle instructions are involved in
35882 // a particular chain.
35883 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
35884 return Res;
35885
35886 // Simplify source operands based on shuffle mask.
35887 // TODO - merge this into combineX86ShufflesRecursively.
35888 APInt KnownUndef, KnownZero;
35889 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
35890 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
35891 return SDValue(N, 0);
35892 }
35893
35894 // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
35895 // in the upper 64 bits.
35896 // TODO: Can we generalize this using computeKnownBits?
35897 if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
35898 (VT == MVT::v2f64 || VT == MVT::v2i64) &&
35899 N->getOperand(0).getOpcode() == ISD::BITCAST) {
35900 SDValue In = N->getOperand(0).getOperand(0);
35901 EVT InVT = In.getValueType();
35902 switch (In.getOpcode()) {
35903 default:
35904 break;
35905 case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
35906 case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
35907 case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
35908 case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
35909 case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
35910 case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
35911 case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
35912 if ((InVT == MVT::v4f32 || InVT == MVT::v4i32) &&
35913 (In.getOperand(0).getValueType() == MVT::v2f64 ||
35914 In.getOperand(0).getValueType() == MVT::v2i64))
35915 return N->getOperand(0); // return the bitcast
35916 break;
35917 case X86ISD::STRICT_CVTTP2SI:
35918 case X86ISD::STRICT_CVTTP2UI:
35919 case X86ISD::STRICT_CVTSI2P:
35920 case X86ISD::STRICT_CVTUI2P:
35921 case X86ISD::STRICT_VFPROUND:
35922 if ((InVT == MVT::v4f32 || InVT == MVT::v4i32) &&
35923 (In.getOperand(1).getValueType() == MVT::v2f64 ||
35924 In.getOperand(1).getValueType() == MVT::v2i64))
35925 return N->getOperand(0); // return the bitcast
35926 break;
35927 case X86ISD::CVTPS2PH:
35928 case X86ISD::MCVTPS2PH:
35929 if (InVT == MVT::v8i16 && In.getOperand(0).getValueType() == MVT::v4f32)
35930 return N->getOperand(0); // return the bitcast
35931 break;
35932 case X86ISD::STRICT_CVTPS2PH:
35933 if (InVT == MVT::v8i16 && In.getOperand(1).getValueType() == MVT::v4f32)
35934 return N->getOperand(0); // return the bitcast
35935 break;
35936 }
35937 }
35938
35939 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
35940 // insert into a zero vector. This helps get VZEXT_MOVL closer to
35941 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
35942 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
35943 if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
35944 N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
35945 N->getOperand(0).hasOneUse() &&
35946 N->getOperand(0).getOperand(0).isUndef() &&
35947 isNullConstant(N->getOperand(0).getOperand(2))) {
35948 SDValue In = N->getOperand(0).getOperand(1);
35949 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
35950 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
35951 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
35952 Movl, N->getOperand(0).getOperand(2));
35953 }
35954
35955 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
35956 // the load is volatile.
35957 if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
35958 ISD::isNormalLoad(N->getOperand(0).getNode())) {
35959 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
35960 if (LN->isSimple()) {
35961 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35962 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35963 SDValue VZLoad =
35964 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35965 VT.getVectorElementType(),
35966 LN->getPointerInfo(),
35967 LN->getAlignment(),
35968 LN->getMemOperand()->getFlags());
35969 DCI.CombineTo(N, VZLoad);
35970 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
35971 DCI.recursivelyDeleteUnusedNodes(LN);
35972 return SDValue(N, 0);
35973 }
35974 }
35975
35976 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast and
35977 // can just use a VZEXT_LOAD.
35978 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
35979 if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
35980 N->getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD) {
35981 auto *LN = cast<MemSDNode>(N->getOperand(0));
35982 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
35983 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
35984 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
35985 SDValue VZLoad =
35986 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35987 LN->getMemoryVT(), LN->getMemOperand());
35988 DCI.CombineTo(N, VZLoad);
35989 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
35990 DCI.recursivelyDeleteUnusedNodes(LN);
35991 return SDValue(N, 0);
35992 }
35993 }
35994
35995 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
35996 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
35997 // if the upper bits of the i64 are zero.
35998 if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
35999 N->getOperand(0)->getOpcode() == ISD::SCALAR_TO_VECTOR &&
36000 N->getOperand(0).getOperand(0).hasOneUse() &&
36001 N->getOperand(0).getOperand(0).getValueType() == MVT::i64) {
36002 SDValue In = N->getOperand(0).getOperand(0);
36003 APInt Mask = APInt::getHighBitsSet(64, 32);
36004 if (DAG.MaskedValueIsZero(In, Mask)) {
36005 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, In);
36006 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
36007 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Trunc);
36008 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, VecVT, SclVec);
36009 return DAG.getBitcast(VT, Movl);
36010 }
36011 }
36012
36013 return SDValue();
36014}
36015
36016bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
36017 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
36018 TargetLoweringOpt &TLO, unsigned Depth) const {
36019 int NumElts = DemandedElts.getBitWidth();
36020 unsigned Opc = Op.getOpcode();
36021 EVT VT = Op.getValueType();
36022
36023 // Handle special case opcodes.
36024 switch (Opc) {
36025 case X86ISD::PMULDQ:
36026 case X86ISD::PMULUDQ: {
36027 APInt LHSUndef, LHSZero;
36028 APInt RHSUndef, RHSZero;
36029 SDValue LHS = Op.getOperand(0);
36030 SDValue RHS = Op.getOperand(1);
36031 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
36032 Depth + 1))
36033 return true;
36034 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
36035 Depth + 1))
36036 return true;
36037 // Multiply by zero.
36038 KnownZero = LHSZero | RHSZero;
36039 break;
36040 }
36041 case X86ISD::VSHL:
36042 case X86ISD::VSRL:
36043 case X86ISD::VSRA: {
36044 // We only need the bottom 64-bits of the (128-bit) shift amount.
36045 SDValue Amt = Op.getOperand(1);
36046 MVT AmtVT = Amt.getSimpleValueType();
36047 assert(AmtVT.is128BitVector() && "Unexpected value type");
36048
36049 // If the shift amount is only ever reused as an SSE shift amount then we know
36050 // that only the bottom 64-bits are ever used.
36051 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
36052 unsigned UseOpc = Use->getOpcode();
36053 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
36054 UseOpc == X86ISD::VSRA) &&
36055 Use->getOperand(0) != Amt;
36056 });
36057
36058 APInt AmtUndef, AmtZero;
36059 unsigned NumAmtElts = AmtVT.getVectorNumElements();
36060 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
36061 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
36062 Depth + 1, AssumeSingleUse))
36063 return true;
36064 LLVM_FALLTHROUGH;
36065 }
36066 case X86ISD::VSHLI:
36067 case X86ISD::VSRLI:
36068 case X86ISD::VSRAI: {
36069 SDValue Src = Op.getOperand(0);
36070 APInt SrcUndef;
36071 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
36072 Depth + 1))
36073 return true;
36074 // TODO convert SrcUndef to KnownUndef.
36075 break;
36076 }
36077 case X86ISD::KSHIFTL: {
36078 SDValue Src = Op.getOperand(0);
36079 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
36080 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
36081 unsigned ShiftAmt = Amt->getZExtValue();
36082
36083 if (ShiftAmt == 0)
36084 return TLO.CombineTo(Op, Src);
36085
36086 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
36087 // single shift. We can do this if the bottom bits (which are shifted
36088 // out) are never demanded.
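          // e.g. kshiftl(kshiftr(X, 3), 5) -> kshiftl(X, 2) when none of the
          // low 5 mask elements are demanded.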
36089 if (Src.getOpcode() == X86ISD::KSHIFTR) {
36090 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
36091 unsigned C1 = Src.getConstantOperandVal(1);
36092 unsigned NewOpc = X86ISD::KSHIFTL;
36093 int Diff = ShiftAmt - C1;
36094 if (Diff < 0) {
36095 Diff = -Diff;
36096 NewOpc = X86ISD::KSHIFTR;
36097 }
36098
36099 SDLoc dl(Op);
36100 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
36101 return TLO.CombineTo(
36102 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
36103 }
36104 }
36105
36106 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
36107 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
36108 Depth + 1))
36109 return true;
36110
36111 KnownUndef <<= ShiftAmt;
36112 KnownZero <<= ShiftAmt;
36113 KnownZero.setLowBits(ShiftAmt);
36114 break;
36115 }
36116 case X86ISD::KSHIFTR: {
36117 SDValue Src = Op.getOperand(0);
36118 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
36119     assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
36120 unsigned ShiftAmt = Amt->getZExtValue();
36121
36122 if (ShiftAmt == 0)
36123 return TLO.CombineTo(Op, Src);
36124
36125 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
36126 // single shift. We can do this if the top bits (which are shifted
36127 // out) are never demanded.
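          // e.g. kshiftr(kshiftl(X, 3), 5) -> kshiftr(X, 2) when none of the
          // top 5 mask elements are demanded.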
36128 if (Src.getOpcode() == X86ISD::KSHIFTL) {
36129 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
36130 unsigned C1 = Src.getConstantOperandVal(1);
36131 unsigned NewOpc = X86ISD::KSHIFTR;
36132 int Diff = ShiftAmt - C1;
36133 if (Diff < 0) {
36134 Diff = -Diff;
36135 NewOpc = X86ISD::KSHIFTL;
36136 }
36137
36138 SDLoc dl(Op);
36139 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
36140 return TLO.CombineTo(
36141 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
36142 }
36143 }
36144
36145 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
36146 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
36147 Depth + 1))
36148 return true;
36149
36150 KnownUndef.lshrInPlace(ShiftAmt);
36151 KnownZero.lshrInPlace(ShiftAmt);
36152 KnownZero.setHighBits(ShiftAmt);
36153 break;
36154 }
36155 case X86ISD::CVTSI2P:
36156 case X86ISD::CVTUI2P: {
36157 SDValue Src = Op.getOperand(0);
36158 MVT SrcVT = Src.getSimpleValueType();
36159 APInt SrcUndef, SrcZero;
36160 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36161 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
36162 Depth + 1))
36163 return true;
36164 break;
36165 }
36166 case X86ISD::PACKSS:
36167 case X86ISD::PACKUS: {
36168 SDValue N0 = Op.getOperand(0);
36169 SDValue N1 = Op.getOperand(1);
36170
36171 APInt DemandedLHS, DemandedRHS;
36172 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36173
36174 APInt SrcUndef, SrcZero;
36175 if (SimplifyDemandedVectorElts(N0, DemandedLHS, SrcUndef, SrcZero, TLO,
36176 Depth + 1))
36177 return true;
36178 if (SimplifyDemandedVectorElts(N1, DemandedRHS, SrcUndef, SrcZero, TLO,
36179 Depth + 1))
36180 return true;
36181
36182 // Aggressively peek through ops to get at the demanded elts.
36183     // TODO - we should do this for all target/faux shuffle ops.
36184 if (!DemandedElts.isAllOnesValue()) {
36185 APInt DemandedSrcBits =
36186 APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
36187 SDValue NewN0 = SimplifyMultipleUseDemandedBits(
36188 N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
36189 SDValue NewN1 = SimplifyMultipleUseDemandedBits(
36190 N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
36191 if (NewN0 || NewN1) {
36192 NewN0 = NewN0 ? NewN0 : N0;
36193 NewN1 = NewN1 ? NewN1 : N1;
36194 return TLO.CombineTo(Op,
36195 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
36196 }
36197 }
36198 break;
36199 }
36200 case X86ISD::HADD:
36201 case X86ISD::HSUB:
36202 case X86ISD::FHADD:
36203 case X86ISD::FHSUB: {
36204 APInt DemandedLHS, DemandedRHS;
36205 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36206
36207 APInt LHSUndef, LHSZero;
36208 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, LHSUndef,
36209 LHSZero, TLO, Depth + 1))
36210 return true;
36211 APInt RHSUndef, RHSZero;
36212 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, RHSUndef,
36213 RHSZero, TLO, Depth + 1))
36214 return true;
36215 break;
36216 }
36217 case X86ISD::VTRUNC:
36218 case X86ISD::VTRUNCS:
36219 case X86ISD::VTRUNCUS: {
36220 SDValue Src = Op.getOperand(0);
36221 MVT SrcVT = Src.getSimpleValueType();
36222 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
36223 APInt SrcUndef, SrcZero;
36224 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
36225 Depth + 1))
36226 return true;
36227 KnownZero = SrcZero.zextOrTrunc(NumElts);
36228 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
36229 break;
36230 }
36231 case X86ISD::BLENDV: {
36232 APInt SelUndef, SelZero;
36233 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
36234 SelZero, TLO, Depth + 1))
36235 return true;
36236
36237 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
36238 APInt LHSUndef, LHSZero;
36239 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
36240 LHSZero, TLO, Depth + 1))
36241 return true;
36242
36243 APInt RHSUndef, RHSZero;
36244 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
36245 RHSZero, TLO, Depth + 1))
36246 return true;
36247
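        // An element is known zero/undef only if it is zero/undef in both
        // selectable operands.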
36248 KnownZero = LHSZero & RHSZero;
36249 KnownUndef = LHSUndef & RHSUndef;
36250 break;
36251 }
36252 case X86ISD::VBROADCAST: {
36253 SDValue Src = Op.getOperand(0);
36254 MVT SrcVT = Src.getSimpleValueType();
36255 if (!SrcVT.isVector())
36256 return false;
36257 // Don't bother broadcasting if we just need the 0'th element.
36258 if (DemandedElts == 1) {
36259 if (Src.getValueType() != VT)
36260 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
36261 SDLoc(Op));
36262 return TLO.CombineTo(Op, Src);
36263 }
36264 APInt SrcUndef, SrcZero;
36265 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
36266 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
36267 Depth + 1))
36268 return true;
36269 break;
36270 }
36271 case X86ISD::VPERMV: {
36272 SDValue Mask = Op.getOperand(0);
36273 APInt MaskUndef, MaskZero;
36274 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
36275 Depth + 1))
36276 return true;
36277 break;
36278 }
36279 case X86ISD::PSHUFB:
36280 case X86ISD::VPERMV3:
36281 case X86ISD::VPERMILPV: {
36282 SDValue Mask = Op.getOperand(1);
36283 APInt MaskUndef, MaskZero;
36284 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
36285 Depth + 1))
36286 return true;
36287 break;
36288 }
36289 case X86ISD::VPPERM:
36290 case X86ISD::VPERMIL2: {
36291 SDValue Mask = Op.getOperand(2);
36292 APInt MaskUndef, MaskZero;
36293 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
36294 Depth + 1))
36295 return true;
36296 break;
36297 }
36298 }
36299
36300 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
36301 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
36302 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
36303 if ((VT.is256BitVector() || VT.is512BitVector()) &&
36304 DemandedElts.lshr(NumElts / 2) == 0) {
36305 unsigned SizeInBits = VT.getSizeInBits();
36306 unsigned ExtSizeInBits = SizeInBits / 2;
36307
36308 // See if 512-bit ops only use the bottom 128-bits.
36309 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
36310 ExtSizeInBits = SizeInBits / 4;
36311
36312 switch (Opc) {
36313 // Zero upper elements.
36314 case X86ISD::VZEXT_MOVL: {
36315 SDLoc DL(Op);
36316 SDValue Ext0 =
36317 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
36318 SDValue ExtOp =
36319 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
36320 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36321 SDValue Insert =
36322 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
36323 return TLO.CombineTo(Op, Insert);
36324 }
36325 // Subvector broadcast.
36326 case X86ISD::SUBV_BROADCAST: {
36327 SDLoc DL(Op);
36328 SDValue Src = Op.getOperand(0);
36329 if (Src.getValueSizeInBits() > ExtSizeInBits)
36330 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
36331 else if (Src.getValueSizeInBits() < ExtSizeInBits) {
36332 MVT SrcSVT = Src.getSimpleValueType().getScalarType();
36333 MVT SrcVT =
36334 MVT::getVectorVT(SrcSVT, ExtSizeInBits / SrcSVT.getSizeInBits());
36335 Src = TLO.DAG.getNode(X86ISD::SUBV_BROADCAST, DL, SrcVT, Src);
36336 }
36337 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
36338 TLO.DAG, DL, ExtSizeInBits));
36339 }
36340 // Byte shifts by immediate.
36341 case X86ISD::VSHLDQ:
36342 case X86ISD::VSRLDQ:
36343 // Shift by uniform.
36344 case X86ISD::VSHL:
36345 case X86ISD::VSRL:
36346 case X86ISD::VSRA:
36347 // Shift by immediate.
36348 case X86ISD::VSHLI:
36349 case X86ISD::VSRLI:
36350 case X86ISD::VSRAI: {
36351 SDLoc DL(Op);
36352 SDValue Ext0 =
36353 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
36354 SDValue ExtOp =
36355 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
36356 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36357 SDValue Insert =
36358 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
36359 return TLO.CombineTo(Op, Insert);
36360 }
36361 case X86ISD::VPERMI: {
36362 // Simplify PERMPD/PERMQ to extract_subvector.
36363 // TODO: This should be done in shuffle combining.
36364 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
36365 SmallVector<int, 4> Mask;
36366 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
36367 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
36368 SDLoc DL(Op);
36369 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
36370 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36371 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
36372 return TLO.CombineTo(Op, Insert);
36373 }
36374 }
36375 break;
36376 }
36377 // Target Shuffles.
36378 case X86ISD::PSHUFB:
36379 case X86ISD::UNPCKL:
36380 case X86ISD::UNPCKH:
36381 // Saturated Packs.
36382 case X86ISD::PACKSS:
36383 case X86ISD::PACKUS:
36384 // Horizontal Ops.
36385 case X86ISD::HADD:
36386 case X86ISD::HSUB:
36387 case X86ISD::FHADD:
36388 case X86ISD::FHSUB: {
36389 SDLoc DL(Op);
36390 MVT ExtVT = VT.getSimpleVT();
36391 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
36392 ExtSizeInBits / ExtVT.getScalarSizeInBits());
36393 SDValue Ext0 =
36394 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
36395 SDValue Ext1 =
36396 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
36397 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
36398 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
36399 SDValue Insert =
36400 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
36401 return TLO.CombineTo(Op, Insert);
36402 }
36403 }
36404 }
36405
36406 // Get target/faux shuffle mask.
36407 APInt OpUndef, OpZero;
36408 SmallVector<int, 64> OpMask;
36409 SmallVector<SDValue, 2> OpInputs;
36410 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
36411 OpZero, TLO.DAG, Depth, false))
36412 return false;
36413
36414 // Shuffle inputs must be the same size as the result.
36415 if (OpMask.size() != (unsigned)NumElts ||
36416 llvm::any_of(OpInputs, [VT](SDValue V) {
36417 return VT.getSizeInBits() != V.getValueSizeInBits() ||
36418 !V.getValueType().isVector();
36419 }))
36420 return false;
36421
36422 KnownZero = OpZero;
36423 KnownUndef = OpUndef;
36424
36425 // Check if shuffle mask can be simplified to undef/zero/identity.
36426 int NumSrcs = OpInputs.size();
36427 for (int i = 0; i != NumElts; ++i)
36428 if (!DemandedElts[i])
36429 OpMask[i] = SM_SentinelUndef;
36430
36431 if (isUndefInRange(OpMask, 0, NumElts)) {
36432 KnownUndef.setAllBits();
36433 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
36434 }
36435 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
36436 KnownZero.setAllBits();
36437 return TLO.CombineTo(
36438 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
36439 }
36440 for (int Src = 0; Src != NumSrcs; ++Src)
36441 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
36442 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
36443
36444 // Attempt to simplify inputs.
36445 for (int Src = 0; Src != NumSrcs; ++Src) {
36446 // TODO: Support inputs of different types.
36447 if (OpInputs[Src].getValueType() != VT)
36448 continue;
36449
36450 int Lo = Src * NumElts;
36451 APInt SrcElts = APInt::getNullValue(NumElts);
36452 for (int i = 0; i != NumElts; ++i)
36453 if (DemandedElts[i]) {
36454 int M = OpMask[i] - Lo;
36455 if (0 <= M && M < NumElts)
36456 SrcElts.setBit(M);
36457 }
36458
36459 // TODO - Propagate input undef/zero elts.
36460 APInt SrcUndef, SrcZero;
36461 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
36462 TLO, Depth + 1))
36463 return true;
36464 }
36465
36466 // If we don't demand all elements, then attempt to combine to a simpler
36467 // shuffle.
36468 // TODO: Handle other depths, but first we need to handle the fact that
36469 // it might combine to the same shuffle.
36470 if (!DemandedElts.isAllOnesValue() && Depth == 0) {
36471 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
36472 for (int i = 0; i != NumElts; ++i)
36473 if (DemandedElts[i])
36474 DemandedMask[i] = i;
36475
36476 SDValue NewShuffle = combineX86ShufflesRecursively(
36477 {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
36478 /*AllowVarMask*/ true, TLO.DAG, Subtarget);
36479 if (NewShuffle)
36480 return TLO.CombineTo(Op, NewShuffle);
36481 }
36482
36483 return false;
36484}
36485
36486bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
36487 SDValue Op, const APInt &OriginalDemandedBits,
36488 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
36489 unsigned Depth) const {
36490 EVT VT = Op.getValueType();
36491 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
36492 unsigned Opc = Op.getOpcode();
36493 switch(Opc) {
36494 case X86ISD::PMULDQ:
36495 case X86ISD::PMULUDQ: {
36496 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
36497 KnownBits KnownOp;
36498 SDValue LHS = Op.getOperand(0);
36499 SDValue RHS = Op.getOperand(1);
36500 // FIXME: Can we bound this better?
36501 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
36502 if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
36503 TLO, Depth + 1))
36504 return true;
36505 if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
36506 TLO, Depth + 1))
36507 return true;
36508
36509 // Aggressively peek through ops to get at the demanded low bits.
36510 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
36511 LHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
36512 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
36513 RHS, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
36514 if (DemandedLHS || DemandedRHS) {
36515 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
36516 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
36517 return TLO.CombineTo(
36518 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
36519 }
36520 break;
36521 }
36522 case X86ISD::VSHLI: {
36523 SDValue Op0 = Op.getOperand(0);
36524
36525 unsigned ShAmt = Op.getConstantOperandVal(1);
36526 if (ShAmt >= BitWidth)
36527 break;
36528
36529 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
36530
36531 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
36532 // single shift. We can do this if the bottom bits (which are shifted
36533     // out) are never demanded.
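          // e.g. vshli(vsrli(X, 3), 5) -> vshli(X, 2) when the demanded bits
          // have at least 5 trailing zeros.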
36534 if (Op0.getOpcode() == X86ISD::VSRLI &&
36535 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
36536 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
36537 if (Shift2Amt < BitWidth) {
36538 int Diff = ShAmt - Shift2Amt;
36539 if (Diff == 0)
36540 return TLO.CombineTo(Op, Op0.getOperand(0));
36541
36542 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
36543 SDValue NewShift = TLO.DAG.getNode(
36544 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
36545 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
36546 return TLO.CombineTo(Op, NewShift);
36547 }
36548 }
36549
36550 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
36551 TLO, Depth + 1))
36552 return true;
36553
36554     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
36555 Known.Zero <<= ShAmt;
36556 Known.One <<= ShAmt;
36557
36558 // Low bits known zero.
36559 Known.Zero.setLowBits(ShAmt);
36560 break;
36561 }
36562 case X86ISD::VSRLI: {
36563 unsigned ShAmt = Op.getConstantOperandVal(1);
36564 if (ShAmt >= BitWidth)
36565 break;
36566
36567 APInt DemandedMask = OriginalDemandedBits << ShAmt;
36568
36569 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
36570 OriginalDemandedElts, Known, TLO, Depth + 1))
36571 return true;
36572
36573     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
36574 Known.Zero.lshrInPlace(ShAmt);
36575 Known.One.lshrInPlace(ShAmt);
36576
36577 // High bits known zero.
36578 Known.Zero.setHighBits(ShAmt);
36579 break;
36580 }
36581 case X86ISD::VSRAI: {
36582 SDValue Op0 = Op.getOperand(0);
36583 SDValue Op1 = Op.getOperand(1);
36584
36585 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
36586 if (ShAmt >= BitWidth)
36587 break;
36588
36589 APInt DemandedMask = OriginalDemandedBits << ShAmt;
36590
36591 // If we just want the sign bit then we don't need to shift it.
36592 if (OriginalDemandedBits.isSignMask())
36593 return TLO.CombineTo(Op, Op0);
36594
36595 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
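          // (The shl only discards high bits that the following sra re-creates
          // when X already has more than C1 known sign bits.)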
36596 if (Op0.getOpcode() == X86ISD::VSHLI &&
36597 Op.getOperand(1) == Op0.getOperand(1)) {
36598 SDValue Op00 = Op0.getOperand(0);
36599 unsigned NumSignBits =
36600 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
36601 if (ShAmt < NumSignBits)
36602 return TLO.CombineTo(Op, Op00);
36603 }
36604
36605 // If any of the demanded bits are produced by the sign extension, we also
36606 // demand the input sign bit.
36607 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
36608 DemandedMask.setSignBit();
36609
36610 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
36611 TLO, Depth + 1))
36612 return true;
36613
36614     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
36615 Known.Zero.lshrInPlace(ShAmt);
36616 Known.One.lshrInPlace(ShAmt);
36617
36618 // If the input sign bit is known to be zero, or if none of the top bits
36619 // are demanded, turn this into an unsigned shift right.
36620 if (Known.Zero[BitWidth - ShAmt - 1] ||
36621 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
36622 return TLO.CombineTo(
36623 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
36624
36625 // High bits are known one.
36626 if (Known.One[BitWidth - ShAmt - 1])
36627 Known.One.setHighBits(ShAmt);
36628 break;
36629 }
36630 case X86ISD::PEXTRB:
36631 case X86ISD::PEXTRW: {
36632 SDValue Vec = Op.getOperand(0);
36633 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36634 MVT VecVT = Vec.getSimpleValueType();
36635 unsigned NumVecElts = VecVT.getVectorNumElements();
36636
36637 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
36638 unsigned Idx = CIdx->getZExtValue();
36639 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
36640
36641 // If we demand no bits from the vector then we must have demanded
36642       // bits from the implicit zext - simplify to zero.
36643 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
36644 if (DemandedVecBits == 0)
36645 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
36646
36647 APInt KnownUndef, KnownZero;
36648 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
36649 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
36650 KnownZero, TLO, Depth + 1))
36651 return true;
36652
36653 KnownBits KnownVec;
36654 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
36655 KnownVec, TLO, Depth + 1))
36656 return true;
36657
36658 if (SDValue V = SimplifyMultipleUseDemandedBits(
36659 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
36660 return TLO.CombineTo(
36661 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
36662
36663 Known = KnownVec.zext(BitWidth);
36664 return false;
36665 }
36666 break;
36667 }
36668 case X86ISD::PINSRB:
36669 case X86ISD::PINSRW: {
36670 SDValue Vec = Op.getOperand(0);
36671 SDValue Scl = Op.getOperand(1);
36672 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
36673 MVT VecVT = Vec.getSimpleValueType();
36674
36675 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
36676 unsigned Idx = CIdx->getZExtValue();
36677 if (!OriginalDemandedElts[Idx])
36678 return TLO.CombineTo(Op, Vec);
36679
36680 KnownBits KnownVec;
36681 APInt DemandedVecElts(OriginalDemandedElts);
36682 DemandedVecElts.clearBit(Idx);
36683 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
36684 KnownVec, TLO, Depth + 1))
36685 return true;
36686
36687 KnownBits KnownScl;
36688 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
36689 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
36690 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
36691 return true;
36692
36693 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
36694 Known.One = KnownVec.One & KnownScl.One;
36695 Known.Zero = KnownVec.Zero & KnownScl.Zero;
36696 return false;
36697 }
36698 break;
36699 }
36700 case X86ISD::PACKSS:
36701 // PACKSS saturates to MIN/MAX integer values. So if we just want the
36702     // sign bit then we can just ask for the source operand's sign bit.
36703 // TODO - add known bits handling.
36704 if (OriginalDemandedBits.isSignMask()) {
36705 APInt DemandedLHS, DemandedRHS;
36706 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
36707
36708 KnownBits KnownLHS, KnownRHS;
36709 APInt SignMask = APInt::getSignMask(BitWidth * 2);
36710 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
36711 KnownLHS, TLO, Depth + 1))
36712 return true;
36713 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
36714 KnownRHS, TLO, Depth + 1))
36715 return true;
36716
36717 // Attempt to avoid multi-use ops if we don't need anything from them.
36718 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
36719 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
36720 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
36721 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
36722 if (DemandedOp0 || DemandedOp1) {
36723 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
36724 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
36725 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
36726 }
36727 }
36728 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
36729 break;
36730 case X86ISD::PCMPGT:
36731 // icmp sgt(0, R) == ashr(R, BitWidth-1).
36732 // iff we only need the sign bit then we can use R directly.
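    // (0 > R) splats the sign of R across each lane, so its sign bit always
    // matches the sign bit of R.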
36733 if (OriginalDemandedBits.isSignMask() &&
36734 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
36735 return TLO.CombineTo(Op, Op.getOperand(1));
36736 break;
36737 case X86ISD::MOVMSK: {
36738 SDValue Src = Op.getOperand(0);
36739 MVT SrcVT = Src.getSimpleValueType();
36740 unsigned SrcBits = SrcVT.getScalarSizeInBits();
36741 unsigned NumElts = SrcVT.getVectorNumElements();
36742
36743 // If we don't need the sign bits at all just return zero.
36744 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
36745 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
36746
36747 // Only demand the vector elements of the sign bits we need.
36748 APInt KnownUndef, KnownZero;
36749 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
36750 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
36751 TLO, Depth + 1))
36752 return true;
36753
36754 Known.Zero = KnownZero.zextOrSelf(BitWidth);
36755 Known.Zero.setHighBits(BitWidth - NumElts);
36756
36757 // MOVMSK only uses the MSB from each vector element.
36758 KnownBits KnownSrc;
36759 if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
36760 KnownSrc, TLO, Depth + 1))
36761 return true;
36762
36763 if (KnownSrc.One[SrcBits - 1])
36764 Known.One.setLowBits(NumElts);
36765 else if (KnownSrc.Zero[SrcBits - 1])
36766 Known.Zero.setLowBits(NumElts);
36767 return false;
36768 }
36769 case X86ISD::BEXTR: {
36770 SDValue Op0 = Op.getOperand(0);
36771 SDValue Op1 = Op.getOperand(1);
36772
36773 // Only bottom 16-bits of the control bits are required.
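      // (BEXTR control layout: bits[7:0] = start index, bits[15:8] = length.)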
36774 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36775 // NOTE: SimplifyDemandedBits won't do this for constants.
36776 const APInt &Val1 = Cst1->getAPIntValue();
36777 APInt MaskedVal1 = Val1 & 0xFFFF;
36778 if (MaskedVal1 != Val1) {
36779 SDLoc DL(Op);
36780 return TLO.CombineTo(
36781 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
36782 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
36783 }
36784 }
36785
36786 KnownBits Known1;
36787 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
36788 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
36789 return true;
36790
36791 // If the length is 0, replace with 0.
36792 KnownBits LengthBits = Known1.extractBits(8, 8);
36793 if (LengthBits.isZero())
36794 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
36795
36796 break;
36797 }
36798 }
36799
36800 return TargetLowering::SimplifyDemandedBitsForTargetNode(
36801 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
36802}
36803
36804SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
36805 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
36806 SelectionDAG &DAG, unsigned Depth) const {
36807 int NumElts = DemandedElts.getBitWidth();
36808 unsigned Opc = Op.getOpcode();
36809 EVT VT = Op.getValueType();
36810
36811 switch (Opc) {
36812 case X86ISD::PINSRB:
36813 case X86ISD::PINSRW: {
36814 // If we don't demand the inserted element, return the base vector.
36815 SDValue Vec = Op.getOperand(0);
36816 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
36817 MVT VecVT = Vec.getSimpleValueType();
36818 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
36819 !DemandedElts[CIdx->getZExtValue()])
36820 return Vec;
36821 break;
36822 }
36823 case X86ISD::PCMPGT:
36824 // icmp sgt(0, R) == ashr(R, BitWidth-1).
36825 // iff we only need the sign bit then we can use R directly.
36826 if (DemandedBits.isSignMask() &&
36827 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
36828 return Op.getOperand(1);
36829 break;
36830 }
36831
36832 APInt ShuffleUndef, ShuffleZero;
36833 SmallVector<int, 16> ShuffleMask;
36834 SmallVector<SDValue, 2> ShuffleOps;
36835 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
36836 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
36837 // If all the demanded elts are from one operand and are inline,
36838 // then we can use the operand directly.
36839 int NumOps = ShuffleOps.size();
36840 if (ShuffleMask.size() == (unsigned)NumElts &&
36841 llvm::all_of(ShuffleOps, [VT](SDValue V) {
36842 return VT.getSizeInBits() == V.getValueSizeInBits();
36843 })) {
36844
36845 if (DemandedElts.isSubsetOf(ShuffleUndef))
36846 return DAG.getUNDEF(VT);
36847 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
36848 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
36849
36850 // Bitmask that indicates which ops have only been accessed 'inline'.
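        // After the loop below at most one bit remains set: the single op
        // whose demanded, non-undef elements are all referenced in place
        // (EltIdx == i).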
36851 APInt IdentityOp = APInt::getAllOnesValue(NumOps);
36852 for (int i = 0; i != NumElts; ++i) {
36853 int M = ShuffleMask[i];
36854 if (!DemandedElts[i] || ShuffleUndef[i])
36855 continue;
36856 int OpIdx = M / NumElts;
36857 int EltIdx = M % NumElts;
36858 if (M < 0 || EltIdx != i) {
36859 IdentityOp.clearAllBits();
36860 break;
36861 }
36862 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
36863 if (IdentityOp == 0)
36864 break;
36865 }
36866       assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
36867              "Multiple identity shuffles detected");
36868
36869 if (IdentityOp != 0)
36870 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
36871 }
36872 }
36873
36874 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
36875 Op, DemandedBits, DemandedElts, DAG, Depth);
36876}
36877
36878// Helper to peek through bitops/setcc to determine size of source vector.
36879// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
36880static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
36881 switch (Src.getOpcode()) {
36882 case ISD::SETCC:
36883 return Src.getOperand(0).getValueSizeInBits() == Size;
36884 case ISD::AND:
36885 case ISD::XOR:
36886 case ISD::OR:
36887 return checkBitcastSrcVectorSize(Src.getOperand(0), Size) &&
36888 checkBitcastSrcVectorSize(Src.getOperand(1), Size);
36889 }
36890 return false;
36891}
36892
36893// Helper to push sign extension of vXi1 SETCC result through bitops.
36894static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
36895 SDValue Src, const SDLoc &DL) {
36896 switch (Src.getOpcode()) {
36897 case ISD::SETCC:
36898 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
36899 case ISD::AND:
36900 case ISD::XOR:
36901 case ISD::OR:
36902 return DAG.getNode(
36903 Src.getOpcode(), DL, SExtVT,
36904 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
36905 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
36906 }
36907   llvm_unreachable("Unexpected node type for vXi1 sign extension");
36908}
36909
36910// Try to match patterns such as
36911// (i16 bitcast (v16i1 x))
36912// ->
36913// (i16 movmsk (16i8 sext (v16i1 x)))
36914// before the illegal vector is scalarized on subtargets that don't have legal
36915// vxi1 types.
36916static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
36917 const SDLoc &DL,
36918 const X86Subtarget &Subtarget) {
36919 EVT SrcVT = Src.getValueType();
36920 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
36921 return SDValue();
36922
36923   // If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
36924 // movmskb even with avx512. This will be better than truncating to vXi1 and
36925 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
36926 // vpcmpeqb/vpcmpgtb.
36927 bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
36928 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
36929 Src.getOperand(0).getValueType() == MVT::v32i8 ||
36930 Src.getOperand(0).getValueType() == MVT::v64i8);
36931
36932 // With AVX512 vxi1 types are legal and we prefer using k-regs.
36933 // MOVMSK is supported in SSE2 or later.
36934 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
36935 return SDValue();
36936
36937 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
36938 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
36939 // v8i16 and v16i16.
36940 // For these two cases, we can shuffle the upper element bytes to a
36941 // consecutive sequence at the start of the vector and treat the results as
36942 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
36943 // for v16i16 this is not the case, because the shuffle is expensive, so we
36944 // avoid sign-extending to this type entirely.
36945 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
36946 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
36947 MVT SExtVT;
36948 bool PropagateSExt = false;
36949 switch (SrcVT.getSimpleVT().SimpleTy) {
36950 default:
36951 return SDValue();
36952 case MVT::v2i1:
36953 SExtVT = MVT::v2i64;
36954 break;
36955 case MVT::v4i1:
36956 SExtVT = MVT::v4i32;
36957 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
36958 // sign-extend to a 256-bit operation to avoid truncation.
36959 if (Subtarget.hasAVX() && checkBitcastSrcVectorSize(Src, 256)) {
36960 SExtVT = MVT::v4i64;
36961 PropagateSExt = true;
36962 }
36963 break;
36964 case MVT::v8i1:
36965 SExtVT = MVT::v8i16;
36966 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
36967 // sign-extend to a 256-bit operation to match the compare.
36968 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
36969 // 256-bit because the shuffle is cheaper than sign extending the result of
36970 // the compare.
36971 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256) ||
36972 checkBitcastSrcVectorSize(Src, 512))) {
36973 SExtVT = MVT::v8i32;
36974 PropagateSExt = true;
36975 }
36976 break;
36977 case MVT::v16i1:
36978 SExtVT = MVT::v16i8;
36979 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
36980 // it is not profitable to sign-extend to 256-bit because this will
36981 // require an extra cross-lane shuffle which is more expensive than
36982 // truncating the result of the compare to 128-bits.
36983 break;
36984 case MVT::v32i1:
36985 SExtVT = MVT::v32i8;
36986 break;
36987 case MVT::v64i1:
36988     // If we have AVX512F but not AVX512BW, the input must be a truncate from
36989     // v64i8 (checked earlier); split the input and make two pmovmskbs.
36990 if (Subtarget.hasAVX512()) {
36991 if (Subtarget.hasBWI())
36992 return SDValue();
36993 SExtVT = MVT::v64i8;
36994 break;
36995 }
36996 // Split if this is a <64 x i8> comparison result.
36997 if (checkBitcastSrcVectorSize(Src, 512)) {
36998 SExtVT = MVT::v64i8;
36999 break;
37000 }
37001 return SDValue();
37002 };
37003
37004 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
37005 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
37006
37007 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
37008 V = getPMOVMSKB(DL, V, DAG, Subtarget);
37009 } else {
37010 if (SExtVT == MVT::v8i16)
37011 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
37012 DAG.getUNDEF(MVT::v8i16));
37013 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
37014 }
37015
37016 EVT IntVT =
37017 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
37018 V = DAG.getZExtOrTrunc(V, DL, IntVT);
37019 return DAG.getBitcast(VT, V);
37020}
37021
37022// Convert a vXi1 constant build vector to the same width scalar integer.
37023static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
37024 EVT SrcVT = Op.getValueType();
37025   assert(SrcVT.getVectorElementType() == MVT::i1 &&
37026          "Expected a vXi1 vector");
37027   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
37028          "Expected a constant build vector");
37029
37030 APInt Imm(SrcVT.getVectorNumElements(), 0);
37031 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
37032 SDValue In = Op.getOperand(Idx);
37033 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
37034 Imm.setBit(Idx);
37035 }
37036 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
37037 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
37038}
37039
37040static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
37041 TargetLowering::DAGCombinerInfo &DCI,
37042 const X86Subtarget &Subtarget) {
37043   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
37044
37045 if (!DCI.isBeforeLegalizeOps())
37046 return SDValue();
37047
37048 // Only do this if we have k-registers.
37049 if (!Subtarget.hasAVX512())
37050 return SDValue();
37051
37052 EVT DstVT = N->getValueType(0);
37053 SDValue Op = N->getOperand(0);
37054 EVT SrcVT = Op.getValueType();
37055
37056 if (!Op.hasOneUse())
37057 return SDValue();
37058
37059 // Look for logic ops.
37060 if (Op.getOpcode() != ISD::AND &&
37061 Op.getOpcode() != ISD::OR &&
37062 Op.getOpcode() != ISD::XOR)
37063 return SDValue();
37064
37065 // Make sure we have a bitcast between mask registers and a scalar type.
37066 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
37067 DstVT.isScalarInteger()) &&
37068 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
37069 SrcVT.isScalarInteger()))
37070 return SDValue();
37071
37072 SDValue LHS = Op.getOperand(0);
37073 SDValue RHS = Op.getOperand(1);
37074
37075 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
37076 LHS.getOperand(0).getValueType() == DstVT)
37077 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
37078 DAG.getBitcast(DstVT, RHS));
37079
37080 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
37081 RHS.getOperand(0).getValueType() == DstVT)
37082 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
37083 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
37084
37085 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
37086 // Most of these have to move a constant from the scalar domain anyway.
37087 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
37088 RHS = combinevXi1ConstantToInteger(RHS, DAG);
37089 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
37090 DAG.getBitcast(DstVT, LHS), RHS);
37091 }
37092
37093 return SDValue();
37094}
37095
37096static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
37097 const X86Subtarget &Subtarget) {
37098 SDLoc DL(BV);
37099 unsigned NumElts = BV->getNumOperands();
37100 SDValue Splat = BV->getSplatValue();
37101
37102 // Build MMX element from integer GPR or SSE float values.
37103 auto CreateMMXElement = [&](SDValue V) {
37104 if (V.isUndef())
37105 return DAG.getUNDEF(MVT::x86mmx);
37106 if (V.getValueType().isFloatingPoint()) {
37107 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
37108 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
37109 V = DAG.getBitcast(MVT::v2i64, V);
37110 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
37111 }
37112 V = DAG.getBitcast(MVT::i32, V);
37113 } else {
37114 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
37115 }
37116 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
37117 };
37118
37119 // Convert build vector ops to MMX data in the bottom elements.
37120 SmallVector<SDValue, 8> Ops;
37121
37122 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
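  // e.g. for a v8i8 splat the byte is first unpacked against itself into a
  // 16-bit pair, then PSHUFW replicates that 16-bit element across the vector.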
37123 if (Splat) {
37124 if (Splat.isUndef())
37125 return DAG.getUNDEF(MVT::x86mmx);
37126
37127 Splat = CreateMMXElement(Splat);
37128
37129 if (Subtarget.hasSSE1()) {
37130 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
37131 if (NumElts == 8)
37132 Splat = DAG.getNode(
37133 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
37134 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
37135 Splat);
37136
37137 // Use PSHUFW to repeat 16-bit elements.
37138 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
37139 return DAG.getNode(
37140 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
37141 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32),
37142 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
37143 }
37144 Ops.append(NumElts, Splat);
37145 } else {
37146 for (unsigned i = 0; i != NumElts; ++i)
37147 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
37148 }
37149
37150 // Use tree of PUNPCKLs to build up general MMX vector.
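  // Each pass interleaves adjacent pairs of Ops, halving the list until a
  // single MMX value remains.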
37151 while (Ops.size() > 1) {
37152 unsigned NumOps = Ops.size();
37153 unsigned IntrinOp =
37154 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
37155 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
37156 : Intrinsic::x86_mmx_punpcklbw));
37157 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
37158 for (unsigned i = 0; i != NumOps; i += 2)
37159 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
37160 Ops[i], Ops[i + 1]);
37161 Ops.resize(NumOps / 2);
37162 }
37163
37164 return Ops[0];
37165}
37166
37167static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
37168 TargetLowering::DAGCombinerInfo &DCI,
37169 const X86Subtarget &Subtarget) {
37170 SDValue N0 = N->getOperand(0);
37171 EVT VT = N->getValueType(0);
37172 EVT SrcVT = N0.getValueType();
37173
37174 // Try to match patterns such as
37175 // (i16 bitcast (v16i1 x))
37176 // ->
37177 // (i16 movmsk (16i8 sext (v16i1 x)))
37178 // before the setcc result is scalarized on subtargets that don't have legal
37179 // vxi1 types.
37180 if (DCI.isBeforeLegalize()) {
37181 SDLoc dl(N);
37182 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
37183 return V;
37184
37185 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
37186 // legalization destroys the v4i32 type.
37187 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
37188 VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
37189 N0.getOperand(0).getValueType() == MVT::v4i32 &&
37190 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
37191 cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
37192 SDValue N00 = N0.getOperand(0);
37193 // Only do this if we can avoid scalarizing the input.
37194 if (ISD::isNormalLoad(N00.getNode()) ||
37195 (N00.getOpcode() == ISD::BITCAST &&
37196 N00.getOperand(0).getValueType() == MVT::v4f32)) {
37197 SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
37198 DAG.getBitcast(MVT::v4f32, N00));
37199 return DAG.getZExtOrTrunc(V, dl, VT);
37200 }
37201 }
37202
37203 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
37204 // type, widen both sides to avoid a trip through memory.
37205 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
37206 Subtarget.hasAVX512()) {
37207 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
37208 N0 = DAG.getBitcast(MVT::v8i1, N0);
37209 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
37210 DAG.getIntPtrConstant(0, dl));
37211 }
37212
37213 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
37214 // type, widen both sides to avoid a trip through memory.
37215 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
37216 Subtarget.hasAVX512()) {
37217 // Use zeros for the widening if we already have some zeroes. This can
37218 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
37219 // stream of this.
37220 // FIXME: It might make sense to detect a concat_vectors with a mix of
37221 // zeroes and undef and turn it into insert_subvector for i1 vectors as
37222 // a separate combine. What we can't do is canonicalize the operands of
37223 // such a concat or we'll get into a loop with SimplifyDemandedBits.
37224 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
37225 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
37226 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
37227 SrcVT = LastOp.getValueType();
37228 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
37229 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
37230 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
37231 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
37232 N0 = DAG.getBitcast(MVT::i8, N0);
37233 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
37234 }
37235 }
37236
37237 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
37238 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
37239 Ops[0] = N0;
37240 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
37241 N0 = DAG.getBitcast(MVT::i8, N0);
37242 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
37243 }
37244 }
37245
37246 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
37247 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
37248 // due to insert_subvector legalization on KNL. By promoting the copy to i16
37249 // we can help with known bits propagation from the vXi1 domain to the
37250 // scalar domain.
37251 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
37252 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37253 N0.getOperand(0).getValueType() == MVT::v16i1 &&
37254 isNullConstant(N0.getOperand(1)))
37255 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
37256 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
37257
37258 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
37259 // and the vbroadcast_load are both integer or both fp. In some cases this
37260 // will remove the bitcast entirely.
37261 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
37262 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
37263 auto *BCast = cast<MemIntrinsicSDNode>(N0);
37264 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
37265 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
37266 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
37267 : MVT::getIntegerVT(MemSize);
37268 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
37269 : MVT::getIntegerVT(SrcVTSize);
37270 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
37271
37272 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
37273 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
37274 SDValue ResNode =
37275 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
37276 MemVT, BCast->getMemOperand());
37277 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
37278 return DAG.getBitcast(VT, ResNode);
37279 }
37280
37281 // Since MMX types are special and don't usually play with other vector types,
37282 // it's better to handle them early to be sure we emit efficient code by
37283 // avoiding store-load conversions.
37284 if (VT == MVT::x86mmx) {
37285 // Detect MMX constant vectors.
37286 APInt UndefElts;
37287 SmallVector<APInt, 1> EltBits;
37288 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
37289 SDLoc DL(N0);
37290 // Handle zero-extension of i32 with MOVD.
37291 if (EltBits[0].countLeadingZeros() >= 32)
37292 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
37293 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
37294 // Else, bitcast to a double.
37295 // TODO - investigate supporting sext 32-bit immediates on x86_64.
37296 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
37297 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
37298 }
37299
37300 // Detect bitcasts to x86mmx low word.
37301 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
37302 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
37303 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
37304 bool LowUndef = true, AllUndefOrZero = true;
37305 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
37306 SDValue Op = N0.getOperand(i);
37307 LowUndef &= Op.isUndef() || (i >= e/2);
37308 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
37309 }
37310 if (AllUndefOrZero) {
37311 SDValue N00 = N0.getOperand(0);
37312 SDLoc dl(N00);
37313 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
37314 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
37315 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
37316 }
37317 }
37318
37319 // Detect bitcasts of 64-bit build vectors and convert to a
37320 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
37321 // lowest element.
37322 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
37323 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
37324 SrcVT == MVT::v8i8))
37325 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
37326
37327 // Detect bitcasts between element or subvector extraction to x86mmx.
37328 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
37329 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
37330 isNullConstant(N0.getOperand(1))) {
37331 SDValue N00 = N0.getOperand(0);
37332 if (N00.getValueType().is128BitVector())
37333 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
37334 DAG.getBitcast(MVT::v2i64, N00));
37335 }
37336
37337 // Detect bitcasts from FP_TO_SINT to x86mmx.
37338 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
37339 SDLoc DL(N0);
37340 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
37341 DAG.getUNDEF(MVT::v2i32));
37342 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
37343 DAG.getBitcast(MVT::v2i64, Res));
37344 }
37345 }
37346
37347 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
37348 // most of these to scalar anyway.
37349 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
37350 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
37351 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
37352 return combinevXi1ConstantToInteger(N0, DAG);
37353 }
37354
37355 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
37356 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
37357 isa<ConstantSDNode>(N0)) {
37358 auto *C = cast<ConstantSDNode>(N0);
37359 if (C->isAllOnesValue())
37360 return DAG.getConstant(1, SDLoc(N0), VT);
37361 if (C->isNullValue())
37362 return DAG.getConstant(0, SDLoc(N0), VT);
37363 }
37364
37365 // Try to remove bitcasts from input and output of mask arithmetic to
37366 // remove GPR<->K-register crossings.
37367 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
37368 return V;
37369
37370 // Convert a bitcasted integer logic operation that has one bitcasted
37371 // floating-point operand into a floating-point logic operation. This may
37372 // create a load of a constant, but that is cheaper than materializing the
37373 // constant in an integer register and transferring it to an SSE register or
37374 // transferring the SSE operand to integer register and back.
37375 unsigned FPOpcode;
37376 switch (N0.getOpcode()) {
37377 case ISD::AND: FPOpcode = X86ISD::FAND; break;
37378 case ISD::OR: FPOpcode = X86ISD::FOR; break;
37379 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
37380 default: return SDValue();
37381 }
37382
37383 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
37384 (Subtarget.hasSSE2() && VT == MVT::f64)))
37385 return SDValue();
37386
37387 SDValue LogicOp0 = N0.getOperand(0);
37388 SDValue LogicOp1 = N0.getOperand(1);
37389 SDLoc DL0(N0);
37390
37391 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
37392 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
37393 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
37394 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
37395 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
37396 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
37397 }
37398 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
37399 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
37400 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
37401 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
37402 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
37403 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
37404 }
37405
37406 return SDValue();
37407}
37408
37409 // Given an ABS node, detect the following pattern:
37410// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
37411// This is useful as it is the input into a SAD pattern.
37412static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
37413 SDValue AbsOp1 = Abs->getOperand(0);
37414 if (AbsOp1.getOpcode() != ISD::SUB)
37415 return false;
37416
37417 Op0 = AbsOp1.getOperand(0);
37418 Op1 = AbsOp1.getOperand(1);
37419
37420 // Check if the operands of the sub are zero-extended from vectors of i8.
37421 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
37422 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
37423 Op1.getOpcode() != ISD::ZERO_EXTEND ||
37424 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
37425 return false;
37426
37427 return true;
37428}
37429
37430// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
37431// to these zexts.
37432static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
37433 const SDValue &Zext1, const SDLoc &DL,
37434 const X86Subtarget &Subtarget) {
37435 // Find the appropriate width for the PSADBW.
37436 EVT InVT = Zext0.getOperand(0).getValueType();
37437 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
37438
37439 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
37440 // fill in the missing vector elements with 0.
37441 unsigned NumConcat = RegSize / InVT.getSizeInBits();
37442 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
37443 Ops[0] = Zext0.getOperand(0);
37444 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
37445 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
37446 Ops[0] = Zext1.getOperand(0);
37447 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
37448
37449 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
37450 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
37451 ArrayRef<SDValue> Ops) {
37452 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
37453 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
37454 };
37455 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
37456 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
37457 PSADBWBuilder);
37458}
37459
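A minimal scalar model of one PSADBW lane as it is commonly described (sum of absolute differences of eight unsigned byte pairs, result in the low bits of a 64-bit lane); the function name here is invented for the sketch:

#include <cassert>
#include <cstdint>

static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += A[i] > B[i] ? A[i] - B[i] : B[i] - A[i]; // |A[i] - B[i]|
  return Sum; // fits in 16 bits; the rest of the 64-bit lane is zero
}

int main() {
  uint8_t A[8] = {10, 0, 255, 3, 4, 5, 6, 7};
  uint8_t B[8] = {7, 2, 0, 3, 9, 5, 1, 7};
  assert(psadbwLane(A, B) == 270);
  return 0;
}
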
37460// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
37461// PHMINPOSUW.
37462static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
37463 const X86Subtarget &Subtarget) {
37464 // Bail without SSE41.
37465 if (!Subtarget.hasSSE41())
37466 return SDValue();
37467
37468 EVT ExtractVT = Extract->getValueType(0);
37469 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
37470 return SDValue();
37471
37472 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
37473 ISD::NodeType BinOp;
37474 SDValue Src = DAG.matchBinOpReduction(
37475 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
37476 if (!Src)
37477 return SDValue();
37478
37479 EVT SrcVT = Src.getValueType();
37480 EVT SrcSVT = SrcVT.getScalarType();
37481 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
37482 return SDValue();
37483
37484 SDLoc DL(Extract);
37485 SDValue MinPos = Src;
37486
37487 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
37488 while (SrcVT.getSizeInBits() > 128) {
37489 unsigned NumElts = SrcVT.getVectorNumElements();
37490 unsigned NumSubElts = NumElts / 2;
37491 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
37492 unsigned SubSizeInBits = SrcVT.getSizeInBits();
37493 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
37494 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
37495 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
37496 }
37497 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
37498 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
37499 "Unexpected value type");
37500
37501 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
37502 // to flip the value accordingly.
37503 SDValue Mask;
37504 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
37505 if (BinOp == ISD::SMAX)
37506 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
37507 else if (BinOp == ISD::SMIN)
37508 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
37509 else if (BinOp == ISD::UMAX)
37510 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
37511
37512 if (Mask)
37513 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
37514
37515 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
37516 // shuffling each upper element down and inserting zeros. This means that the
37517 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
37518 // ready for the PHMINPOS.
37519 if (ExtractVT == MVT::i8) {
37520 SDValue Upper = DAG.getVectorShuffle(
37521 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
37522 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
37523 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
37524 }
37525
37526 // Perform the PHMINPOS on a v8i16 vector.
37527 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
37528 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
37529 MinPos = DAG.getBitcast(SrcVT, MinPos);
37530
37531 if (Mask)
37532 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
37533
37534 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
37535 DAG.getIntPtrConstant(0, DL));
37536}
37537
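The mask/XOR dance above maps SMAX, SMIN and UMAX reductions onto an unsigned min so that PHMINPOSUW can do the work. A scalar spot-check of the SMAX case (helper name invented; two's complement assumed):

#include <algorithm>
#include <cassert>
#include <cstdint>

static int16_t smaxViaUmin(int16_t A, int16_t B) {
  const uint16_t Mask = 0x7FFF; // APInt::getSignedMaxValue(16)
  uint16_t EA = (uint16_t)A ^ Mask, EB = (uint16_t)B ^ Mask; // pre-XOR
  return (int16_t)(std::min(EA, EB) ^ Mask);                 // UMIN, post-XOR
}

int main() {
  assert(smaxViaUmin(5, -3) == 5);
  assert(smaxViaUmin(-7, -3) == -3);
  assert(smaxViaUmin(1000, 999) == 1000);
  return 0;
}
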
37538// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
37539static SDValue combineHorizontalPredicateResult(SDNode *Extract,
37540 SelectionDAG &DAG,
37541 const X86Subtarget &Subtarget) {
37542 // Bail without SSE2.
37543 if (!Subtarget.hasSSE2())
37544 return SDValue();
37545
37546 EVT ExtractVT = Extract->getValueType(0);
37547 unsigned BitWidth = ExtractVT.getSizeInBits();
37548 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
37549 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
37550 return SDValue();
37551
37552 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
37553 ISD::NodeType BinOp;
37554 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
37555 if (!Match && ExtractVT == MVT::i1)
37556 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
37557 if (!Match)
37558 return SDValue();
37559
37560 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
37561 // which we can't support here for now.
37562 if (Match.getScalarValueSizeInBits() != BitWidth)
37563 return SDValue();
37564
37565 SDValue Movmsk;
37566 SDLoc DL(Extract);
37567 EVT MatchVT = Match.getValueType();
37568 unsigned NumElts = MatchVT.getVectorNumElements();
37569 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
37570 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37571
37572 if (ExtractVT == MVT::i1) {
37573 // Special case for (pre-legalization) vXi1 reductions.
37574 if (NumElts > 64 || !isPowerOf2_32(NumElts))
37575 return SDValue();
37576 if (TLI.isTypeLegal(MatchVT)) {
37577 // If this is a legal AVX512 predicate type then we can just bitcast.
37578 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
37579 Movmsk = DAG.getBitcast(MovmskVT, Match);
37580 } else {
37581 // Use combineBitcastvxi1 to create the MOVMSK.
37582 while (NumElts > MaxElts) {
37583 SDValue Lo, Hi;
37584 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
37585 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
37586 NumElts /= 2;
37587 }
37588 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
37589 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
37590 }
37591 if (!Movmsk)
37592 return SDValue();
37593 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
37594 } else {
37595 // FIXME: Better handling of k-registers or 512-bit vectors?
37596 unsigned MatchSizeInBits = Match.getValueSizeInBits();
37597 if (!(MatchSizeInBits == 128 ||
37598 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
37599 return SDValue();
37600
37601 // Make sure this isn't a vector of 1 element. The perf win from using
37602 // MOVMSK diminishes with fewer elements in the reduction, but it is
37603 // generally better to get the comparison over to the GPRs as soon as
37604 // possible to reduce the number of vector ops.
37605 if (Match.getValueType().getVectorNumElements() < 2)
37606 return SDValue();
37607
37608 // Check that we are extracting a reduction of all sign bits.
37609 if (DAG.ComputeNumSignBits(Match) != BitWidth)
37610 return SDValue();
37611
37612 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
37613 SDValue Lo, Hi;
37614 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
37615 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
37616 MatchSizeInBits = Match.getValueSizeInBits();
37617 }
37618
37619 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
37620 MVT MaskSrcVT;
37621 if (64 == BitWidth || 32 == BitWidth)
37622 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
37623 MatchSizeInBits / BitWidth);
37624 else
37625 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
37626
37627 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
37628 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
37629 NumElts = MaskSrcVT.getVectorNumElements();
37630 }
37631 assert((NumElts <= 32 || NumElts == 64) &&
37632 "Not expecting more than 64 elements");
37633
37634 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
37635 if (BinOp == ISD::XOR) {
37636 // parity -> (AND (CTPOP(MOVMSK X)), 1)
37637 SDValue Mask = DAG.getConstant(1, DL, CmpVT);
37638 SDValue Result = DAG.getNode(ISD::CTPOP, DL, CmpVT, Movmsk);
37639 Result = DAG.getNode(ISD::AND, DL, CmpVT, Result, Mask);
37640 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
37641 }
37642
37643 SDValue CmpC;
37644 ISD::CondCode CondCode;
37645 if (BinOp == ISD::OR) {
37646 // any_of -> MOVMSK != 0
37647 CmpC = DAG.getConstant(0, DL, CmpVT);
37648 CondCode = ISD::CondCode::SETNE;
37649 } else {
37650 // all_of -> MOVMSK == ((1 << NumElts) - 1)
37651 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
37652 DL, CmpVT);
37653 CondCode = ISD::CondCode::SETEQ;
37654 }
37655
37656 // The setcc produces an i8 of 0/1, so extend that to the result width and
37657 // negate to get the final 0/-1 mask value.
37658 EVT SetccVT =
37659 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
37660 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
37661 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
37662 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
37663 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
37664}
37665
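Once the sign bits are packed into a scalar mask (the MOVMSK step), the three reductions become simple integer tests. A small scalar sketch of those final comparisons, with made-up values:

#include <bitset>
#include <cassert>
#include <cstdint>

int main() {
  const unsigned NumElts = 8;
  uint32_t Movmsk = 0xB1; // pretend sign bits of eight compare results

  bool AnyOf = Movmsk != 0;                                  // OR reduction
  bool AllOf = Movmsk == ((1u << NumElts) - 1);              // AND reduction
  bool Parity = (std::bitset<32>(Movmsk).count() & 1) != 0;  // XOR reduction

  assert(AnyOf && !AllOf && !Parity); // 0xB1 has four bits set
  return 0;
}
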
37666static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
37667 const X86Subtarget &Subtarget) {
37668 // PSADBW is only supported on SSE2 and up.
37669 if (!Subtarget.hasSSE2())
37670 return SDValue();
37671
37672 // Verify the type we're extracting from is an integer type wider than i16.
37673 EVT VT = Extract->getOperand(0).getValueType();
37674 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
37675 return SDValue();
37676
37677 unsigned RegSize = 128;
37678 if (Subtarget.useBWIRegs())
37679 RegSize = 512;
37680 else if (Subtarget.hasAVX())
37681 RegSize = 256;
37682
37683 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
37684 // TODO: We should be able to handle larger vectors by splitting them before
37685 // feeding them into several SADs, and then reducing over those.
37686 if (RegSize / VT.getVectorNumElements() < 8)
37687 return SDValue();
37688
37689 // Match shuffle + add pyramid.
37690 ISD::NodeType BinOp;
37691 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
37692
37693 // The operand is expected to be zero extended from i8
37694 // (verified in detectZextAbsDiff).
37695 // In order to convert to i64 and above, additional any/zero/sign
37696 // extend is expected.
37697 // The zero extend from 32 bits has no mathematical effect on the result.
37698 // Also, the sign extend is effectively a zero extend
37699 // (it extends the sign bit, which is zero).
37700 // So it is correct to skip the sign/zero extend instruction.
37701 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
37702 Root.getOpcode() == ISD::ZERO_EXTEND ||
37703 Root.getOpcode() == ISD::ANY_EXTEND))
37704 Root = Root.getOperand(0);
37705
37706 // If there was a match, we want Root to be the ABS node at the root of an
37707 // abs-diff pattern.
37708 if (!Root || Root.getOpcode() != ISD::ABS)
37709 return SDValue();
37710
37711 // Check whether we have an abs-diff pattern feeding into the ABS.
37712 SDValue Zext0, Zext1;
37713 if (!detectZextAbsDiff(Root, Zext0, Zext1))
37714 return SDValue();
37715
37716 // Create the SAD instruction.
37717 SDLoc DL(Extract);
37718 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
37719
37720 // If the original vector was wider than 8 elements, sum over the results
37721 // in the SAD vector.
37722 unsigned Stages = Log2_32(VT.getVectorNumElements());
37723 MVT SadVT = SAD.getSimpleValueType();
37724 if (Stages > 3) {
37725 unsigned SadElems = SadVT.getVectorNumElements();
37726
37727 for(unsigned i = Stages - 3; i > 0; --i) {
37728 SmallVector<int, 16> Mask(SadElems, -1);
37729 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
37730 Mask[j] = MaskEnd + j;
37731
37732 SDValue Shuffle =
37733 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
37734 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
37735 }
37736 }
37737
37738 MVT Type = Extract->getSimpleValueType(0);
37739 unsigned TypeSizeInBits = Type.getSizeInBits();
37740 // Return the lowest TypeSizeInBits bits.
37741 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
37742 SAD = DAG.getBitcast(ResVT, SAD);
37743 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
37744 Extract->getOperand(1));
37745}
37746
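The shuffle+add loop above folds the upper half of the SAD vector onto the lower half until the total lands in element 0; a scalar sketch of that halving reduction (values are arbitrary):

#include <cassert>
#include <vector>

int main() {
  std::vector<int> V = {1, 2, 3, 4, 5, 6, 7, 8};
  for (size_t Half = V.size() / 2; Half >= 1; Half /= 2)
    for (size_t i = 0; i != Half; ++i)
      V[i] += V[i + Half]; // "shuffle the upper half down" + ADD
  assert(V[0] == 36);      // 1 + 2 + ... + 8
  return 0;
}
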
37747// Attempt to peek through a target shuffle and extract the scalar from the
37748// source.
37749static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
37750 TargetLowering::DAGCombinerInfo &DCI,
37751 const X86Subtarget &Subtarget) {
37752 if (DCI.isBeforeLegalizeOps())
37753 return SDValue();
37754
37755 SDLoc dl(N);
37756 SDValue Src = N->getOperand(0);
37757 SDValue Idx = N->getOperand(1);
37758
37759 EVT VT = N->getValueType(0);
37760 EVT SrcVT = Src.getValueType();
37761 EVT SrcSVT = SrcVT.getVectorElementType();
37762 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37763
37764 // Don't attempt this for boolean mask vectors or unknown extraction indices.
37765 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
37766 return SDValue();
37767
37768 const APInt &IdxC = N->getConstantOperandAPInt(1);
37769 if (IdxC.uge(NumSrcElts))
37770 return SDValue();
37771
37772 SDValue SrcBC = peekThroughBitcasts(Src);
37773
37774 // Handle extract(bitcast(broadcast(scalar_value))).
37775 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
37776 SDValue SrcOp = SrcBC.getOperand(0);
37777 if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
37778 return DAG.getBitcast(VT, SrcOp);
37779
37780 EVT SrcOpVT = SrcOp.getValueType();
37781 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
37782 (SrcOpVT.getSizeInBits() % SrcSVT.getSizeInBits()) == 0) {
37783 unsigned Scale = SrcOpVT.getSizeInBits() / SrcSVT.getSizeInBits();
37784 unsigned Offset = IdxC.urem(Scale) * SrcSVT.getSizeInBits();
37785 // TODO support non-zero offsets.
37786 if (Offset == 0) {
37787 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
37788 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
37789 return SrcOp;
37790 }
37791 }
37792 }
37793
37794 // If we're extracting a single element from a broadcast load and there are
37795 // no other users, just create a single load.
37796 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
37797 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
37798 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
37799 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
37800 VT.getSizeInBits() == SrcBCWidth) {
37801 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
37802 MemIntr->getBasePtr(),
37803 MemIntr->getPointerInfo(),
37804 MemIntr->getAlignment(),
37805 MemIntr->getMemOperand()->getFlags());
37806 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
37807 return Load;
37808 }
37809 }
37810
37811 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
37812 // TODO: Move to DAGCombine?
37813 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
37814 SrcBC.getValueType().isInteger() &&
37815 (SrcBC.getScalarValueSizeInBits() % SrcSVT.getSizeInBits()) == 0 &&
37816 SrcBC.getScalarValueSizeInBits() ==
37817 SrcBC.getOperand(0).getValueSizeInBits()) {
37818 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcSVT.getSizeInBits();
37819 if (IdxC.ult(Scale)) {
37820 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
37821 SDValue Scl = SrcBC.getOperand(0);
37822 EVT SclVT = Scl.getValueType();
37823 if (Offset) {
37824 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
37825 DAG.getShiftAmountConstant(Offset, SclVT, dl));
37826 }
37827 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
37828 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
37829 return Scl;
37830 }
37831 }
37832
37833 // Handle extract(truncate(x)) for 0'th index.
37834 // TODO: Treat this as a faux shuffle?
37835 // TODO: When can we use this for general indices?
37836 if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
37837 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
37838 Src = DAG.getBitcast(SrcVT, Src);
37839 return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
37840 }
37841
37842 // Resolve the target shuffle inputs and mask.
37843 SmallVector<int, 16> Mask;
37844 SmallVector<SDValue, 2> Ops;
37845 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
37846 return SDValue();
37847
37848 // Shuffle inputs must be the same size as the result.
37849 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
37850 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
37851 }))
37852 return SDValue();
37853
37854 // Attempt to narrow/widen the shuffle mask to the correct size.
37855 if (Mask.size() != NumSrcElts) {
37856 if ((NumSrcElts % Mask.size()) == 0) {
37857 SmallVector<int, 16> ScaledMask;
37858 int Scale = NumSrcElts / Mask.size();
37859 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
37860 Mask = std::move(ScaledMask);
37861 } else if ((Mask.size() % NumSrcElts) == 0) {
37862 // Simplify Mask based on demanded element.
37863 int ExtractIdx = (int)N->getConstantOperandVal(1);
37864 int Scale = Mask.size() / NumSrcElts;
37865 int Lo = Scale * ExtractIdx;
37866 int Hi = Scale * (ExtractIdx + 1);
37867 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
37868 if (i < Lo || Hi <= i)
37869 Mask[i] = SM_SentinelUndef;
37870
37871 SmallVector<int, 16> WidenedMask;
37872 while (Mask.size() > NumSrcElts &&
37873 canWidenShuffleElements(Mask, WidenedMask))
37874 Mask = std::move(WidenedMask);
37875 // TODO - investigate support for wider shuffle masks with known upper
37876 // undef/zero elements for implicit zero-extension.
37877 }
37878 }
37879
37880 // Check if narrowing/widening failed.
37881 if (Mask.size() != NumSrcElts)
37882 return SDValue();
37883
37884 int SrcIdx = Mask[IdxC.getZExtValue()];
37885
37886 // If the shuffle source element is undef/zero then we can just accept it.
37887 if (SrcIdx == SM_SentinelUndef)
37888 return DAG.getUNDEF(VT);
37889
37890 if (SrcIdx == SM_SentinelZero)
37891 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
37892 : DAG.getConstant(0, dl, VT);
37893
37894 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
37895 SrcIdx = SrcIdx % Mask.size();
37896
37897 // We can only extract other elements from 128-bit vectors and in certain
37898 // circumstances, depending on SSE-level.
37899 // TODO: Investigate using extract_subvector for larger vectors.
37900 // TODO: Investigate float/double extraction if it will be just stored.
37901 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
37902 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
37903 assert(SrcSVT == VT && "Unexpected extraction type");
37904 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
37905 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
37906 DAG.getIntPtrConstant(SrcIdx, dl));
37907 }
37908
37909 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
37910 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
37911 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
37912 "Unexpected extraction type");
37913 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
37914 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
37915 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
37916 DAG.getIntPtrConstant(SrcIdx, dl));
37917 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
37918 }
37919
37920 return SDValue();
37921}
37922
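Loosely, "peeking through" the shuffle means the extract can read the referenced input element directly instead of materializing the shuffle; a scalar sketch with a two-input mask where indices >= 4 select from the second input (all names and values invented):

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> A = {10, 11, 12, 13}, B = {20, 21, 22, 23};
  std::array<int, 4> Mask = {6, 1, 4, 3}; // shuffle(A, B, Mask)
  unsigned ExtractIdx = 0;
  int M = Mask[ExtractIdx];
  int Elt = M < 4 ? A[M] : B[M - 4]; // extract without building the shuffle
  assert(Elt == 22);
  return 0;
}
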
37923/// Extracting a scalar FP value from vector element 0 is free, so extract each
37924/// operand first, then perform the math as a scalar op.
37925static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
37926 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
37927 SDValue Vec = ExtElt->getOperand(0);
37928 SDValue Index = ExtElt->getOperand(1);
37929 EVT VT = ExtElt->getValueType(0);
37930 EVT VecVT = Vec.getValueType();
37931
37932 // TODO: If this is a unary/expensive/expand op, allow extraction from a
37933 // non-zero element because the shuffle+scalar op will be cheaper?
37934 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
37935 return SDValue();
37936
37937 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
37938 // extract, the condition code), so deal with those as a special-case.
37939 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
37940 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
37941 if (OpVT != MVT::f32 && OpVT != MVT::f64)
37942 return SDValue();
37943
37944 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
37945 SDLoc DL(ExtElt);
37946 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
37947 Vec.getOperand(0), Index);
37948 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
37949 Vec.getOperand(1), Index);
37950 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
37951 }
37952
37953 if (VT != MVT::f32 && VT != MVT::f64)
37954 return SDValue();
37955
37956 // Vector FP selects don't fit the pattern of FP math ops (because the
37957 // condition has a different type and we have to change the opcode), so deal
37958 // with those here.
37959 // FIXME: This is restricted to pre type legalization by ensuring the setcc
37960 // has i1 elements. If we loosen this we need to convert vector bool to a
37961 // scalar bool.
37962 if (Vec.getOpcode() == ISD::VSELECT &&
37963 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
37964 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
37965 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
37966 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
37967 SDLoc DL(ExtElt);
37968 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
37969 Vec.getOperand(0).getValueType().getScalarType(),
37970 Vec.getOperand(0), Index);
37971 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
37972 Vec.getOperand(1), Index);
37973 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
37974 Vec.getOperand(2), Index);
37975 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
37976 }
37977
37978 // TODO: This switch could include FNEG and the x86-specific FP logic ops
37979 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
37980 // missed load folding and fma+fneg combining.
37981 switch (Vec.getOpcode()) {
37982 case ISD::FMA: // Begin 3 operands
37983 case ISD::FMAD:
37984 case ISD::FADD: // Begin 2 operands
37985 case ISD::FSUB:
37986 case ISD::FMUL:
37987 case ISD::FDIV:
37988 case ISD::FREM:
37989 case ISD::FCOPYSIGN:
37990 case ISD::FMINNUM:
37991 case ISD::FMAXNUM:
37992 case ISD::FMINNUM_IEEE:
37993 case ISD::FMAXNUM_IEEE:
37994 case ISD::FMAXIMUM:
37995 case ISD::FMINIMUM:
37996 case X86ISD::FMAX:
37997 case X86ISD::FMIN:
37998 case ISD::FABS: // Begin 1 operand
37999 case ISD::FSQRT:
38000 case ISD::FRINT:
38001 case ISD::FCEIL:
38002 case ISD::FTRUNC:
38003 case ISD::FNEARBYINT:
38004 case ISD::FROUND:
38005 case ISD::FFLOOR:
38006 case X86ISD::FRCP:
38007 case X86ISD::FRSQRT: {
38008 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
38009 SDLoc DL(ExtElt);
38010 SmallVector<SDValue, 4> ExtOps;
38011 for (SDValue Op : Vec->ops())
38012 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
38013 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
38014 }
38015 default:
38016 return SDValue();
38017 }
38018 llvm_unreachable("All opcodes should return within switch");
38019}
38020
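The scalarization above leans on FP math ops being lane-wise, so extracting lane 0 after the vector op equals doing the op on the lane-0 scalars; a trivial scalar check (values arbitrary, exact in double):

#include <array>
#include <cassert>

int main() {
  std::array<double, 2> X = {3.0, 7.0}, Y = {1.5, 2.5}, Sum;
  for (int i = 0; i != 2; ++i)
    Sum[i] = X[i] + Y[i];        // vector FADD, lane by lane
  assert(Sum[0] == X[0] + Y[0]); // extract-elt 0 commutes with the op
  return 0;
}
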
38021/// Try to convert a vector reduction sequence composed of binops and shuffles
38022/// into horizontal ops.
38023static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
38024 const X86Subtarget &Subtarget) {
38025 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
38026
38027 // We need at least SSE2 to do anything here.
38028 if (!Subtarget.hasSSE2())
38029 return SDValue();
38030
38031 ISD::NodeType Opc;
38032 SDValue Rdx =
38033 DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
38034 if (!Rdx)
38035 return SDValue();
38036
38037 SDValue Index = ExtElt->getOperand(1);
38038 assert(isNullConstant(Index) &&
38039 "Reduction doesn't end in an extract from index 0");
38040
38041 EVT VT = ExtElt->getValueType(0);
38042 EVT VecVT = Rdx.getValueType();
38043 if (VecVT.getScalarType() != VT)
38044 return SDValue();
38045
38046 SDLoc DL(ExtElt);
38047
38048 // vXi8 reduction - sub 128-bit vector.
38049 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
38050 if (VecVT == MVT::v4i8) {
38051 // Pad with zero.
38052 if (Subtarget.hasSSE41()) {
38053 Rdx = DAG.getBitcast(MVT::i32, Rdx);
38054 Rdx = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
38055 DAG.getConstant(0, DL, MVT::v4i32), Rdx,
38056 DAG.getIntPtrConstant(0, DL));
38057 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
38058 } else {
38059 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, Rdx,
38060 DAG.getConstant(0, DL, VecVT));
38061 }
38062 }
38063 if (Rdx.getValueType() == MVT::v8i8) {
38064 // Pad with undef.
38065 Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
38066 DAG.getUNDEF(MVT::v8i8));
38067 }
38068 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
38069 DAG.getConstant(0, DL, MVT::v16i8));
38070 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
38071 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
38072 }
38073
38074 // Must be a >=128-bit vector with pow2 elements.
38075 if ((VecVT.getSizeInBits() % 128) != 0 ||
38076 !isPowerOf2_32(VecVT.getVectorNumElements()))
38077 return SDValue();
38078
38079 // vXi8 reduction - sum lo/hi halves then use PSADBW.
38080 if (VT == MVT::i8) {
38081 while (Rdx.getValueSizeInBits() > 128) {
38082 unsigned HalfSize = VecVT.getSizeInBits() / 2;
38083 unsigned HalfElts = VecVT.getVectorNumElements() / 2;
38084 SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
38085 SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
38086 Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
38087 VecVT = Rdx.getValueType();
38088 }
38089 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
38090
38091 SDValue Hi = DAG.getVectorShuffle(
38092 MVT::v16i8, DL, Rdx, Rdx,
38093 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
38094 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
38095 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
38096 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
38097 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
38098 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
38099 }
38100
38101 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
38102 bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
38103 if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
38104 return SDValue();
38105
38106 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
38107
38108 // 256-bit horizontal instructions operate on 128-bit chunks rather than
38109 // across the whole vector, so we need an extract + hop preliminary stage.
38110 // This is the only step where the operands of the hop are not the same value.
38111 // TODO: We could extend this to handle 512-bit or even longer vectors.
38112 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
38113 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
38114 unsigned NumElts = VecVT.getVectorNumElements();
38115 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
38116 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
38117 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
38118 VecVT = Rdx.getValueType();
38119 }
38120 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
38121 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
38122 return SDValue();
38123
38124 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
38125 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
38126 for (unsigned i = 0; i != ReductionSteps; ++i)
38127 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
38128
38129 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
38130}
38131
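For a 128-bit 4 x i32 vector, hadd(X, X) pairwise-adds adjacent lanes, so log2(4) = 2 steps leave the full sum in lane 0, which is what the loop above exploits. A scalar model of that behavior (the hadd helper is a simplification written for this sketch):

#include <array>
#include <cassert>

static std::array<int, 4> hadd(const std::array<int, 4> &A,
                               const std::array<int, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<int, 4> X = {1, 2, 3, 4};
  std::array<int, 4> R = hadd(X, X); // {3, 7, 3, 7}
  R = hadd(R, R);                    // {10, 10, 10, 10}
  assert(R[0] == 1 + 2 + 3 + 4);
  return 0;
}
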
38132/// Detect vector gather/scatter index generation and convert it from being a
38133/// bunch of shuffles and extracts into a somewhat faster sequence.
38134/// For i686, the best sequence is apparently storing the value and loading
38135/// scalars back, while for x64 we should use 64-bit extracts and shifts.
38136static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
38137 TargetLowering::DAGCombinerInfo &DCI,
38138 const X86Subtarget &Subtarget) {
38139 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
38140 return NewOp;
38141
38142 SDValue InputVector = N->getOperand(0);
38143 SDValue EltIdx = N->getOperand(1);
38144 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
38145
38146 EVT SrcVT = InputVector.getValueType();
38147 EVT VT = N->getValueType(0);
38148 SDLoc dl(InputVector);
38149 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
38150 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38151
38152 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
38153 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
38154
38155 // Integer Constant Folding.
38156 if (CIdx && VT.isInteger()) {
38157 APInt UndefVecElts;
38158 SmallVector<APInt, 16> EltBits;
38159 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
38160 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
38161 EltBits, true, false)) {
38162 uint64_t Idx = CIdx->getZExtValue();
38163 if (UndefVecElts[Idx])
38164 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
38165 return DAG.getConstant(EltBits[Idx].zextOrSelf(VT.getScalarSizeInBits()),
38166 dl, VT);
38167 }
38168 }
38169
38170 if (IsPextr) {
38171 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38172 if (TLI.SimplifyDemandedBits(
38173 SDValue(N, 0), APInt::getAllOnesValue(VT.getSizeInBits()), DCI))
38174 return SDValue(N, 0);
38175
38176 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
38177 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
38178 InputVector.getOpcode() == X86ISD::PINSRW) &&
38179 InputVector.getOperand(2) == EltIdx) {
38180 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
38181 "Vector type mismatch");
38182 SDValue Scl = InputVector.getOperand(1);
38183 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
38184 return DAG.getZExtOrTrunc(Scl, dl, VT);
38185 }
38186
38187 // TODO - Remove this once we can handle the implicit zero-extension of
38188 // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
38189 // combineBasicSADPattern.
38190 return SDValue();
38191 }
38192
38193 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
38194 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
38195 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
38196 SDValue MMXSrc = InputVector.getOperand(0);
38197
38198 // The bitcast source is a direct mmx result.
38199 if (MMXSrc.getValueType() == MVT::x86mmx)
38200 return DAG.getBitcast(VT, InputVector);
38201 }
38202
38203 // Detect mmx to i32 conversion through a v2i32 elt extract.
38204 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
38205 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
38206 SDValue MMXSrc = InputVector.getOperand(0);
38207
38208 // The bitcast source is a direct mmx result.
38209 if (MMXSrc.getValueType() == MVT::x86mmx)
38210 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
38211 }
38212
38213 // Check whether this extract is the root of a sum of absolute differences
38214 // pattern. This has to be done here because we really want it to happen
38215 // pre-legalization.
38216 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
38217 return SAD;
38218
38219 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
38220 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
38221 return Cmp;
38222
38223 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
38224 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
38225 return MinMax;
38226
38227 if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
38228 return V;
38229
38230 if (SDValue V = scalarizeExtEltFP(N, DAG))
38231 return V;
38232
38233 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
38234 // and then testing the relevant element.
38235 if (CIdx && SrcVT.getScalarType() == MVT::i1) {
38236 SmallVector<SDNode *, 16> BoolExtracts;
38237 auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
38238 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
38239 isa<ConstantSDNode>(Use->getOperand(1)) &&
38240 Use->getValueType(0) == MVT::i1) {
38241 BoolExtracts.push_back(Use);
38242 return true;
38243 }
38244 return false;
38245 };
38246 if (all_of(InputVector->uses(), IsBoolExtract) &&
38247 BoolExtracts.size() > 1) {
38248 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
38249 if (SDValue BC =
38250 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
38251 for (SDNode *Use : BoolExtracts) {
38252 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
38253 unsigned MaskIdx = Use->getConstantOperandVal(1);
38254 APInt MaskBit = APInt::getOneBitSet(NumSrcElts, MaskIdx);
38255 SDValue Mask = DAG.getConstant(MaskBit, dl, BCVT);
38256 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
38257 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
38258 DCI.CombineTo(Use, Res);
38259 }
38260 return SDValue(N, 0);
38261 }
38262 }
38263 }
38264
38265 return SDValue();
38266}
38267
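Once the bool vector has been turned into a MOVMSK-style integer, extracting element Idx is just a bit test, which is what the ((movmsk X) & Mask) == Mask rewrite above expresses; a scalar sketch with invented values:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Movmsk = 0x5;                    // 4-element i1 vector: {1, 0, 1, 0}
  unsigned MaskIdx = 2;
  uint32_t MaskBit = 1u << MaskIdx;
  bool Elt = (Movmsk & MaskBit) == MaskBit; // extractelement vXi1 X, 2
  assert(Elt);
  return 0;
}
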
38268/// If a vector select has an operand that is -1 or 0, try to simplify the
38269/// select to a bitwise logic operation.
38270/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
38271static SDValue
38272combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
38273 TargetLowering::DAGCombinerInfo &DCI,
38274 const X86Subtarget &Subtarget) {
38275 SDValue Cond = N->getOperand(0);
38276 SDValue LHS = N->getOperand(1);
38277 SDValue RHS = N->getOperand(2);
38278 EVT VT = LHS.getValueType();
38279 EVT CondVT = Cond.getValueType();
38280 SDLoc DL(N);
38281 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38282
38283 if (N->getOpcode() != ISD::VSELECT)
38284 return SDValue();
38285
38286 assert(CondVT.isVector() && "Vector select expects a vector selector!");
38287
38288 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
38289 // TODO: Can we assert that both operands are not zeros (because that should
38290 // get simplified at node creation time)?
38291 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
38292 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
38293
38294 // If both inputs are 0/undef, create a complete zero vector.
38295 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
38296 if (TValIsAllZeros && FValIsAllZeros) {
38297 if (VT.isFloatingPoint())
38298 return DAG.getConstantFP(0.0, DL, VT);
38299 return DAG.getConstant(0, DL, VT);
38300 }
38301
38302 // To use the condition operand as a bitwise mask, it must have elements that
38303 // are the same size as the select elements. I.e., the condition operand must
38304 // have already been promoted from the IR select condition type <N x i1>.
38305 // Don't check if the types themselves are equal because that excludes
38306 // vector floating-point selects.
38307 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
38308 return SDValue();
38309
38310 // Try to invert the condition if true value is not all 1s and false value is
38311 // not all 0s. Only do this if the condition has one use.
38312 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
38313 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
38314 // Check if the selector will be produced by CMPP*/PCMP*.
38315 Cond.getOpcode() == ISD::SETCC &&
38316 // Check if SETCC has already been promoted.
38317 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
38318 CondVT) {
38319 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
38320
38321 if (TValIsAllZeros || FValIsAllOnes) {
38322 SDValue CC = Cond.getOperand(2);
38323 ISD::CondCode NewCC = ISD::getSetCCInverse(
38324 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
38325 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
38326 NewCC);
38327 std::swap(LHS, RHS);
38328 TValIsAllOnes = FValIsAllOnes;
38329 FValIsAllZeros = TValIsAllZeros;
38330 }
38331 }
38332
38333 // Cond value must be 'sign splat' to be converted to a logical op.
38334 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
38335 return SDValue();
38336
38337 // vselect Cond, 111..., 000... -> Cond
38338 if (TValIsAllOnes && FValIsAllZeros)
38339 return DAG.getBitcast(VT, Cond);
38340
38341 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
38342 return SDValue();
38343
38344 // vselect Cond, 111..., X -> or Cond, X
38345 if (TValIsAllOnes) {
38346 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
38347 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
38348 return DAG.getBitcast(VT, Or);
38349 }
38350
38351 // vselect Cond, X, 000... -> and Cond, X
38352 if (FValIsAllZeros) {
38353 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
38354 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
38355 return DAG.getBitcast(VT, And);
38356 }
38357
38358 // vselect Cond, 000..., X -> andn Cond, X
38359 if (TValIsAllZeros) {
38360 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
38361 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
38362 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
38363 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
38364 return DAG.getBitcast(VT, AndN);
38365 }
38366
38367 return SDValue();
38368}
38369
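A quick scalar check of the three folds above: when every mask element is all-zeros or all-ones, a per-lane select degenerates into OR, AND or ANDN on the mask (plain C++ sketch, not DAG code):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEF;
  for (uint32_t M : {0u, ~0u}) {
    assert((M ? ~0u : X) == (M | X)); // vselect Cond, 111..., X -> or Cond, X
    assert((M ? X : 0u) == (M & X));  // vselect Cond, X, 000... -> and Cond, X
    assert((M ? 0u : X) == (~M & X)); // vselect Cond, 000..., X -> andn Cond, X
  }
  return 0;
}
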
38370/// If both arms of a vector select are concatenated vectors, split the select,
38371/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
38372/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
38373/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
38374static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
38375 const X86Subtarget &Subtarget) {
38376 unsigned Opcode = N->getOpcode();
38377 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
38378 return SDValue();
38379
38380 // TODO: Split 512-bit vectors too?
38381 EVT VT = N->getValueType(0);
38382 if (!VT.is256BitVector())
38383 return SDValue();
38384
38385 // TODO: Split as long as any 2 of the 3 operands are concatenated?
38386 SDValue Cond = N->getOperand(0);
38387 SDValue TVal = N->getOperand(1);
38388 SDValue FVal = N->getOperand(2);
38389 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
38390 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
38391 !collectConcatOps(TVal.getNode(), CatOpsT) ||
38392 !collectConcatOps(FVal.getNode(), CatOpsF))
38393 return SDValue();
38394
38395 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
38396 ArrayRef<SDValue> Ops) {
38397 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
38398 };
38399 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
38400 makeBlend, /*CheckBWI*/ false);
38401}
38402
38403static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
38404 SDValue Cond = N->getOperand(0);
38405 SDValue LHS = N->getOperand(1);
38406 SDValue RHS = N->getOperand(2);
38407 SDLoc DL(N);
38408
38409 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
38410 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
38411 if (!TrueC || !FalseC)
38412 return SDValue();
38413
38414 // Don't do this for crazy integer types.
38415 EVT VT = N->getValueType(0);
38416 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
38417 return SDValue();
38418
38419 // We're going to use the condition bit in math or logic ops. We could allow
38420 // this with a wider condition value (post-legalization it becomes an i8),
38421 // but if nothing is creating selects that late, it doesn't matter.
38422 if (Cond.getValueType() != MVT::i1)
38423 return SDValue();
38424
38425 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
38426 // 3, 5, or 9 with i32/i64, so those get transformed too.
38427 // TODO: For constants that overflow or do not differ by power-of-2 or small
38428 // multiplier, convert to 'and' + 'add'.
38429 const APInt &TrueVal = TrueC->getAPIntValue();
38430 const APInt &FalseVal = FalseC->getAPIntValue();
38431 bool OV;
38432 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
38433 if (OV)
38434 return SDValue();
38435
38436 APInt AbsDiff = Diff.abs();
38437 if (AbsDiff.isPowerOf2() ||
38438 ((VT == MVT::i32 || VT == MVT::i64) &&
38439 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
38440
38441 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
38442 // of the condition can usually be folded into a compare predicate, but even
38443 // without that, the sequence should be cheaper than a CMOV alternative.
38444 if (TrueVal.slt(FalseVal)) {
38445 Cond = DAG.getNOT(DL, Cond, MVT::i1);
38446 std::swap(TrueC, FalseC);
38447 }
38448
38449 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
38450 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
38451
38452 // Multiply condition by the difference if non-one.
38453 if (!AbsDiff.isOneValue())
38454 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
38455
38456 // Add the base if non-zero.
38457 if (!FalseC->isNullValue())
38458 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
38459
38460 return R;
38461 }
38462
38463 return SDValue();
38464}
38465
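A scalar check of the rewrite above: with a 0/1 condition, select Cond, TC, FC equals zext(Cond) * (TC - FC) + FC, and the multiply becomes a shift or LEA when the difference is a power of two or 3/5/9 (helper name invented):

#include <cassert>
#include <cstdint>

static int64_t selectConst(bool Cond, int64_t TC, int64_t FC) {
  return (int64_t)Cond * (TC - FC) + FC; // zext(Cond) * diff + base
}

int main() {
  assert(selectConst(true, 12, 4) == 12); // diff 8: a shift
  assert(selectConst(false, 12, 4) == 4);
  assert(selectConst(true, 5, 2) == 5);   // diff 3: an LEA
  return 0;
}
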
38466/// If this is a *dynamic* select (non-constant condition) and we can match
38467/// this node with one of the variable blend instructions, restructure the
38468/// condition so that blends can use the high (sign) bit of each element.
38469/// This function will also call SimplifyDemandedBits on already created
38470/// BLENDV to perform additional simplifications.
38471static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
38472 TargetLowering::DAGCombinerInfo &DCI,
38473 const X86Subtarget &Subtarget) {
38474 SDValue Cond = N->getOperand(0);
38475 if ((N->getOpcode() != ISD::VSELECT &&
38476 N->getOpcode() != X86ISD::BLENDV) ||
38477 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
38478 return SDValue();
38479
38480 // Don't optimize before the condition has been transformed to a legal type
38481 // and don't ever optimize vector selects that map to AVX512 mask-registers.
38482 unsigned BitWidth = Cond.getScalarValueSizeInBits();
38483 if (BitWidth < 8 || BitWidth > 64)
38484 return SDValue();
38485
38486 // We can only handle the cases where VSELECT is directly legal on the
38487 // subtarget. We custom lower VSELECT nodes with constant conditions and
38488 // this makes it hard to see whether a dynamic VSELECT will correctly
38489 // lower, so we both check the operation's status and explicitly handle the
38490 // cases where a *dynamic* blend will fail even though a constant-condition
38491 // blend could be custom lowered.
38492 // FIXME: We should find a better way to handle this class of problems.
38493 // Potentially, we should combine constant-condition vselect nodes
38494 // pre-legalization into shuffles and not mark as many types as custom
38495 // lowered.
38496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38497 EVT VT = N->getValueType(0);
38498 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
38499 return SDValue();
38500 // FIXME: We don't support i16-element blends currently. We could and
38501 // should support them by making *all* the bits in the condition be set
38502 // rather than just the high bit and using an i8-element blend.
38503 if (VT.getVectorElementType() == MVT::i16)
38504 return SDValue();
38505 // Dynamic blending was only available from SSE4.1 onward.
38506 if (VT.is128BitVector() && !Subtarget.hasSSE41())
38507 return SDValue();
38508 // Byte blends are only available in AVX2.
38509 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
38510 return SDValue();
38511 // There are no 512-bit blend instructions that use sign bits.
38512 if (VT.is512BitVector())
38513 return SDValue();
38514
38515 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
38516 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
38517 UI != UE; ++UI)
38518 if ((UI->getOpcode() != ISD::VSELECT &&
38519 UI->getOpcode() != X86ISD::BLENDV) ||
38520 UI.getOperandNo() != 0)
38521 return false;
38522
38523 return true;
38524 };
38525
38526 if (OnlyUsedAsSelectCond(Cond)) {
38527 APInt DemandedMask(APInt::getSignMask(BitWidth));
38528 KnownBits Known;
38529 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38530 !DCI.isBeforeLegalizeOps());
38531 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
38532 return SDValue();
38533
38534 // If we changed the computation somewhere in the DAG, this change will
38535 // affect all users of Cond. Update all the nodes so that we do not use
38536 // the generic VSELECT anymore. Otherwise, we may perform wrong
38537 // optimizations as we messed with the actual expectation for the vector
38538 // boolean values.
38539 for (SDNode *U : Cond->uses()) {
38540 if (U->getOpcode() == X86ISD::BLENDV)
38541 continue;
38542
38543 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
38544 Cond, U->getOperand(1), U->getOperand(2));
38545 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
38546 DCI.AddToWorklist(U);
38547 }
38548 DCI.CommitTargetLoweringOpt(TLO);
38549 return SDValue(N, 0);
38550 }
38551
38552 // Otherwise we can still at least try to simplify multiple use bits.
38553 APInt DemandedBits(APInt::getSignMask(BitWidth));
38554 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
38555 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
38556 N->getOperand(1), N->getOperand(2));
38557
38558 return SDValue();
38559}
38560
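The reason only the sign bit is demanded from the condition above is that a variable blend picks each lane purely from that bit; a one-lane scalar model (the helper is a simplification for this sketch):

#include <cassert>
#include <cstdint>

static uint8_t blendvLane(uint8_t Cond, uint8_t T, uint8_t F) {
  return (Cond & 0x80) ? T : F; // only bit 7 of the condition matters
}

int main() {
  assert(blendvLane(0xFF, 1, 2) == 1);
  assert(blendvLane(0x80, 1, 2) == 1); // low condition bits are ignored
  assert(blendvLane(0x7F, 1, 2) == 2);
  return 0;
}
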
38561// Try to match:
38562// (or (and (M, (sub 0, X)), (pandn M, X)))
38563// which is a special case of:
38564// (select M, (sub 0, X), X)
38565// Per:
38566// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
38567// We know that, if fNegate is 0 or 1:
38568// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
38569//
38570// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
38571// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
38572// ( M ? -X : X) == ((X ^ M ) + (M & 1))
38573// This lets us transform our vselect to:
38574// (add (xor X, M), (and M, 1))
38575// And further to:
38576// (sub (xor X, M), M)
38577static SDValue combineLogicBlendIntoConditionalNegate(
38578 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
38579 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
38580 EVT MaskVT = Mask.getValueType();
38581 assert(MaskVT.isInteger() &&
38582 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
38583 "Mask must be zero/all-bits");
38584
38585 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
38586 return SDValue();
38587 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
38588 return SDValue();
38589
38590 auto IsNegV = [](SDNode *N, SDValue V) {
38591 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
38592 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
38593 };
38594
38595 SDValue V;
38596 if (IsNegV(Y.getNode(), X))
38597 V = X;
38598 else if (IsNegV(X.getNode(), Y))
38599 V = Y;
38600 else
38601 return SDValue();
38602
38603 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
38604 SDValue SubOp2 = Mask;
38605
38606 // If the negate was on the false side of the select, then
38607 // the operands of the SUB need to be swapped. PR 27251.
38608 // This is because the pattern being matched above is
38609 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
38610 // but if the pattern matched was
38611 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
38612 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
38613 // pattern also needs to be a negation of the replacement pattern above.
38614 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
38615 // sub accomplishes the negation of the replacement pattern.
38616 if (V == Y)
38617 std::swap(SubOp1, SubOp2);
38618
38619 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
38620 return DAG.getBitcast(VT, Res);
38621}
38622
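A scalar spot-check of the identity the comment above cites: with M equal to 0 or all-ones, (M ? -X : X) == ((X ^ M) - M) (two's complement assumed; unsigned arithmetic used to sidestep overflow UB):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 1, -7, 123456}) {
    for (uint32_t M : {0u, ~0u}) {
      int32_t Expected = M ? -X : X;
      int32_t Got = (int32_t)(((uint32_t)X ^ M) - M); // (sub (xor X, M), M)
      assert(Got == Expected);
    }
  }
  return 0;
}
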
38623/// Do target-specific dag combines on SELECT and VSELECT nodes.
38624static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
38625 TargetLowering::DAGCombinerInfo &DCI,
38626 const X86Subtarget &Subtarget) {
38627 SDLoc DL(N);
38628 SDValue Cond = N->getOperand(0);
38629 SDValue LHS = N->getOperand(1);
38630 SDValue RHS = N->getOperand(2);
38631
38632 // Try simplification again because we use this function to optimize
38633 // BLENDV nodes that are not handled by the generic combiner.
38634 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
38635 return V;
38636
38637 EVT VT = LHS.getValueType();
38638 EVT CondVT = Cond.getValueType();
38639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38640 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
38641
38642 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
38643 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
38644 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
38645 if (CondVT.isVector() && CondVT.isInteger() &&
38646 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
38647 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
38648 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
38649 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
38650 DL, DAG, Subtarget))
38651 return V;
38652
38653 // Convert vselects with constant condition into shuffles.
38654 if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
38655 SmallVector<int, 64> Mask;
38656 if (createShuffleMaskFromVSELECT(Mask, Cond))
38657 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
38658 }
38659
38660 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
38661 // instructions match the semantics of the common C idiom x<y?x:y but not
38662 // x<=y?x:y, because of how they handle negative zero (which can be
38663 // ignored in unsafe-math mode).
38664 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
38665 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
38666 VT != MVT::f80 && VT != MVT::f128 &&
38667 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
38668 (Subtarget.hasSSE2() ||
38669 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
38670 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38671
38672 unsigned Opcode = 0;
38673 // Check for x CC y ? x : y.
38674 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
38675 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
38676 switch (CC) {
38677 default: break;
38678 case ISD::SETULT:
38679 // Converting this to a min would handle NaNs incorrectly, and swapping
38680 // the operands would cause it to handle comparisons between positive
38681 // and negative zero incorrectly.
38682 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
38683 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38684 !(DAG.isKnownNeverZeroFloat(LHS) ||
38685 DAG.isKnownNeverZeroFloat(RHS)))
38686 break;
38687 std::swap(LHS, RHS);
38688 }
38689 Opcode = X86ISD::FMIN;
38690 break;
38691 case ISD::SETOLE:
38692 // Converting this to a min would handle comparisons between positive
38693 // and negative zero incorrectly.
38694 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38695 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
38696 break;
38697 Opcode = X86ISD::FMIN;
38698 break;
38699 case ISD::SETULE:
38700 // Converting this to a min would handle both negative zeros and NaNs
38701 // incorrectly, but we can swap the operands to fix both.
38702 std::swap(LHS, RHS);
38703 LLVM_FALLTHROUGH;
38704 case ISD::SETOLT:
38705 case ISD::SETLT:
38706 case ISD::SETLE:
38707 Opcode = X86ISD::FMIN;
38708 break;
38709
38710 case ISD::SETOGE:
38711 // Converting this to a max would handle comparisons between positive
38712 // and negative zero incorrectly.
38713 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38714 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
38715 break;
38716 Opcode = X86ISD::FMAX;
38717 break;
38718 case ISD::SETUGT:
38719 // Converting this to a max would handle NaNs incorrectly, and swapping
38720 // the operands would cause it to handle comparisons between positive
38721 // and negative zero incorrectly.
38722 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
38723 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38724 !(DAG.isKnownNeverZeroFloat(LHS) ||
38725 DAG.isKnownNeverZeroFloat(RHS)))
38726 break;
38727 std::swap(LHS, RHS);
38728 }
38729 Opcode = X86ISD::FMAX;
38730 break;
38731 case ISD::SETUGE:
38732 // Converting this to a max would handle both negative zeros and NaNs
38733 // incorrectly, but we can swap the operands to fix both.
38734 std::swap(LHS, RHS);
38735 LLVM_FALLTHROUGH;
38736 case ISD::SETOGT:
38737 case ISD::SETGT:
38738 case ISD::SETGE:
38739 Opcode = X86ISD::FMAX;
38740 break;
38741 }
38742 // Check for x CC y ? y : x -- a min/max with reversed arms.
38743 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
38744 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
38745 switch (CC) {
38746 default: break;
38747 case ISD::SETOGE:
38748 // Converting this to a min would handle comparisons between positive
38749 // and negative zero incorrectly, and swapping the operands would
38750 // cause it to handle NaNs incorrectly.
38751 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38752 !(DAG.isKnownNeverZeroFloat(LHS) ||
38753 DAG.isKnownNeverZeroFloat(RHS))) {
38754 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38755 break;
38756 std::swap(LHS, RHS);
38757 }
38758 Opcode = X86ISD::FMIN;
38759 break;
38760 case ISD::SETUGT:
38761 // Converting this to a min would handle NaNs incorrectly.
38762 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38763 break;
38764 Opcode = X86ISD::FMIN;
38765 break;
38766 case ISD::SETUGE:
38767 // Converting this to a min would handle both negative zeros and NaNs
38768 // incorrectly, but we can swap the operands to fix both.
38769 std::swap(LHS, RHS);
38770 LLVM_FALLTHROUGH;
38771 case ISD::SETOGT:
38772 case ISD::SETGT:
38773 case ISD::SETGE:
38774 Opcode = X86ISD::FMIN;
38775 break;
38776
38777 case ISD::SETULT:
38778 // Converting this to a max would handle NaNs incorrectly.
38779 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38780 break;
38781 Opcode = X86ISD::FMAX;
38782 break;
38783 case ISD::SETOLE:
38784 // Converting this to a max would handle comparisons between positive
38785 // and negative zero incorrectly, and swapping the operands would
38786 // cause it to handle NaNs incorrectly.
38787 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
38788 !DAG.isKnownNeverZeroFloat(LHS) &&
38789 !DAG.isKnownNeverZeroFloat(RHS)) {
38790 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
38791 break;
38792 std::swap(LHS, RHS);
38793 }
38794 Opcode = X86ISD::FMAX;
38795 break;
38796 case ISD::SETULE:
38797 // Converting this to a max would handle both negative zeros and NaNs
38798 // incorrectly, but we can swap the operands to fix both.
38799 std::swap(LHS, RHS);
38800 LLVM_FALLTHROUGH;
38801 case ISD::SETOLT:
38802 case ISD::SETLT:
38803 case ISD::SETLE:
38804 Opcode = X86ISD::FMAX;
38805 break;
38806 }
38807 }
38808
38809 if (Opcode)
38810 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
38811 }
38812
38813 // Some mask scalar intrinsics rely on checking if only one bit is set
38814 // and implement it in C code like this:
38815 // A[0] = (U & 1) ? A[0] : W[0];
38816 // This creates some redundant instructions that break pattern matching.
38817 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
38818 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
38819 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
38820 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38821 SDValue AndNode = Cond.getOperand(0);
38822 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
38823 isNullConstant(Cond.getOperand(1)) &&
38824 isOneConstant(AndNode.getOperand(1))) {
38825 // LHS and RHS swapped due to
38826 // setcc outputting 1 when AND resulted in 0 and vice versa.
38827 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
38828 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
38829 }
38830 }
38831
38832 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
38833 // lowering on KNL. In this case we convert it to
38834 // v16i8 (select v16i8, v16i8, v16i8) and use the AVX instruction.
38835 // The same situation applies to all vectors of i8 and i16 without BWI.
38836 // Make sure we extend these even before type legalization gets a chance to
38837 // split wide vectors.
38838 // Since SKX these selects have a proper lowering.
38839 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
38840 CondVT.getVectorElementType() == MVT::i1 &&
38841 (VT.getVectorElementType() == MVT::i8 ||
38842 VT.getVectorElementType() == MVT::i16)) {
38843 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
38844 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
38845 }
38846
38847 // AVX512 - Extend select with zero to merge with target shuffle.
38848 // select(mask, extract_subvector(shuffle(x)), zero) -->
38849 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
38850 // TODO - support non target shuffles as well.
38851 if (Subtarget.hasAVX512() && CondVT.isVector() &&
38852 CondVT.getVectorElementType() == MVT::i1) {
38853 auto SelectableOp = [&TLI](SDValue Op) {
38854 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38855 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
38856 isNullConstant(Op.getOperand(1)) &&
38857 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
38858 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
38859 };
38860
38861 bool SelectableLHS = SelectableOp(LHS);
38862 bool SelectableRHS = SelectableOp(RHS);
38863 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
38864 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
38865
38866 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
38867 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
38868 : RHS.getOperand(0).getValueType();
38869 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38870 EVT SrcCondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumSrcElts);
38871 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
38872 VT.getSizeInBits());
38873 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
38874 VT.getSizeInBits());
38875 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
38876 DAG.getUNDEF(SrcCondVT), Cond,
38877 DAG.getIntPtrConstant(0, DL));
38878 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
38879 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
38880 }
38881 }
38882
38883 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
38884 return V;
38885
38886 // Canonicalize max and min:
38887 // (x > y) ? x : y -> (x >= y) ? x : y
38888 // (x < y) ? x : y -> (x <= y) ? x : y
38889 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
38890 // the need for an extra compare
38891 // against zero. e.g.
38892 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
38893 // subl %esi, %edi
38894 // testl %edi, %edi
38895 // movl $0, %eax
38896 // cmovgl %edi, %eax
38897 // =>
38898 // xorl %eax, %eax
38899 // subl %esi, %edi
38900 // cmovsl %eax, %edi
38901 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
38902 Cond.hasOneUse() &&
38903 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
38904 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
38905 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38906 switch (CC) {
38907 default: break;
38908 case ISD::SETLT:
38909 case ISD::SETGT: {
38910 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
38911 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
38912 Cond.getOperand(0), Cond.getOperand(1), NewCC);
38913 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
38914 }
38915 }
38916 }
38917
38918 // Match VSELECTs into subs with unsigned saturation.
38919 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
38920 // psubus is available in SSE2 for i8 and i16 vectors.
38921 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
38922 isPowerOf2_32(VT.getVectorNumElements()) &&
38923 (VT.getVectorElementType() == MVT::i8 ||
38924 VT.getVectorElementType() == MVT::i16)) {
38925 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38926
38927 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
38928 // left side invert the predicate to simplify logic below.
38929 SDValue Other;
38930 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
38931 Other = RHS;
38932 CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
38933 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
38934 Other = LHS;
38935 }
38936
38937 if (Other.getNode() && Other->getNumOperands() == 2 &&
38938 Other->getOperand(0) == Cond.getOperand(0)) {
38939 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
38940 SDValue CondRHS = Cond->getOperand(1);
38941
38942 // Look for a general sub with unsigned saturation first.
38943 // x >= y ? x-y : 0 --> subus x, y
38944 // x > y ? x-y : 0 --> subus x, y
38945 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
38946 Other->getOpcode() == ISD::SUB && OpRHS == CondRHS)
38947 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
38948
38949 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) {
38950 if (isa<BuildVectorSDNode>(CondRHS)) {
38951 // If the RHS is a constant we have to reverse the const
38952 // canonicalization.
38953 // x > C-1 ? x+-C : 0 --> subus x, C
38954 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
38955 return (!Op && !Cond) ||
38956 (Op && Cond &&
38957 Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
38958 };
38959 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
38960 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
38961 /*AllowUndefs*/ true)) {
38962 OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
38963 OpRHS);
38964 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
38965 }
38966
38967 // Another special case: If C was a sign bit, the sub has been
38968 // canonicalized into a xor.
38969 // FIXME: Would it be better to use computeKnownBits to determine
38970 // whether it's safe to decanonicalize the xor?
38971 // x s< 0 ? x^C : 0 --> subus x, C
38972 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
38973 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
38974 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
38975 OpRHSConst->getAPIntValue().isSignMask()) {
38976 // Note that we have to rebuild the RHS constant here to ensure we
38977 // don't rely on particular values of undef lanes.
38978 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
38979 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
38980 }
38981 }
38982 }
38983 }
38984 }
38985 }
38986
38987 // Match VSELECTs into add with unsigned saturation.
38988 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
38989 // paddus is available in SSE2 for i8 and i16 vectors.
38990 Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
38991 isPowerOf2_32(VT.getVectorNumElements()) &&
38992 (VT.getVectorElementType() == MVT::i8 ||
38993 VT.getVectorElementType() == MVT::i16)) {
38994 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
38995
38996 SDValue CondLHS = Cond->getOperand(0);
38997 SDValue CondRHS = Cond->getOperand(1);
38998
38999 // Check if one of the arms of the VSELECT is a vector with all bits set.
39000 // If it's on the left side invert the predicate to simplify logic below.
39001 SDValue Other;
39002 if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
39003 Other = RHS;
39004 CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
39005 } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
39006 Other = LHS;
39007 }
39008
39009 if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
39010 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
39011
39012 // Canonicalize condition operands.
39013 if (CC == ISD::SETUGE) {
39014 std::swap(CondLHS, CondRHS);
39015 CC = ISD::SETULE;
39016 }
39017
39018 // We can test against either of the addition operands.
39019 // x <= x+y ? x+y : ~0 --> addus x, y
39020 // x+y >= x ? x+y : ~0 --> addus x, y
39021 if (CC == ISD::SETULE && Other == CondRHS &&
39022 (OpLHS == CondLHS || OpRHS == CondLHS))
39023 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
39024
39025 if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
39026 CondLHS == OpLHS) {
39027 // If the RHS is a constant we have to reverse the const
39028 // canonicalization.
39029 // x > ~C ? x+C : ~0 --> addus x, C
39030 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
39031 return Cond->getAPIntValue() == ~Op->getAPIntValue();
39032 };
39033 if (CC == ISD::SETULE &&
39034 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
39035 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
39036 }
39037 }
39038 }
39039
39040 // Check if the first operand is all zeros and Cond type is vXi1.
39041 // If this is an AVX512 target we can improve the use of zero masking by
39042 // swapping the operands and inverting the condition.
39043 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
39044 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
39045 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
39046 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
39047 // Invert the cond to not(cond) : xor(op,allones)=not(op)
39048 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
39049 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
39050 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
39051 }
39052
39053 // Early exit check
39054 if (!TLI.isTypeLegal(VT))
39055 return SDValue();
39056
39057 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
39058 return V;
39059
39060 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
39061 return V;
39062
39063 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
39064 return V;
39065
39066 // select(~Cond, X, Y) -> select(Cond, Y, X)
39067 if (CondVT.getScalarType() != MVT::i1)
39068 if (SDValue CondNot = IsNOT(Cond, DAG))
39069 return DAG.getNode(N->getOpcode(), DL, VT,
39070 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
39071
39072 // Try to optimize vXi1 selects if both operands are either all constants or
39073 // bitcasts from scalar integer type. In that case we can convert the operands
39074 // to integer and use an integer select which will be converted to a CMOV.
39075 // We need to take a little bit of care to avoid creating an i64 type after
39076 // type legalization.
39077 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
39078 VT.getVectorElementType() == MVT::i1 &&
39079 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
39080 MVT IntVT = MVT::getIntegerVT(VT.getVectorNumElements());
39081 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
39082 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
39083
39084 if ((LHSIsConst ||
39085 (LHS.getOpcode() == ISD::BITCAST &&
39086 LHS.getOperand(0).getValueType() == IntVT)) &&
39087 (RHSIsConst ||
39088 (RHS.getOpcode() == ISD::BITCAST &&
39089 RHS.getOperand(0).getValueType() == IntVT))) {
39090 if (LHSIsConst)
39091 LHS = combinevXi1ConstantToInteger(LHS, DAG);
39092 else
39093 LHS = LHS.getOperand(0);
39094
39095 if (RHSIsConst)
39096 RHS = combinevXi1ConstantToInteger(RHS, DAG);
39097 else
39098 RHS = RHS.getOperand(0);
39099
39100 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
39101 return DAG.getBitcast(VT, Select);
39102 }
39103 }
39104
39105 return SDValue();
39106}
39107
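// Illustrative sketch (not part of X86ISelLowering.cpp): scalar models of the
// unsigned-saturation select patterns matched in combineSelect above, using
// uint16_t as a stand-in for one vector lane. Names are illustrative only.
#include <cstdint>

constexpr uint16_t subusModel(uint16_t X, uint16_t Y) {
  // x >= y ? x-y : 0  --> ISD::USUBSAT
  return X >= Y ? uint16_t(X - Y) : uint16_t(0);
}
constexpr uint16_t addusModel(uint16_t X, uint16_t Y) {
  // x <= x+y ? x+y : ~0  --> ISD::UADDSAT (x+y wraps exactly when x+y < x).
  return uint16_t(X + Y) >= X ? uint16_t(X + Y) : uint16_t(0xFFFF);
}

static_assert(subusModel(100, 40) == 60, "");
static_assert(subusModel(40, 100) == 0, "");
static_assert(addusModel(10, 20) == 30, "");
static_assert(addusModel(0xFFF0, 0x20) == 0xFFFF, "");
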
39108/// Combine:
39109/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
39110/// to:
39111/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
39112/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
39113/// Note that this is only legal for some op/cc combinations.
39114static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
39115 SelectionDAG &DAG,
39116 const X86Subtarget &Subtarget) {
39117 // This combine only operates on CMP-like nodes.
39118 if (!(Cmp.getOpcode() == X86ISD::CMP ||
39119 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
39120 return SDValue();
39121
39122 // Can't replace the cmp if it has more uses than the one we're looking at.
39123 // FIXME: We would like to be able to handle this, but would need to make sure
39124 // all uses were updated.
39125 if (!Cmp.hasOneUse())
39126 return SDValue();
39127
39128 // This only applies to variations of the common case:
39129 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
39130 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
39131 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
39132 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
39133 // Using the proper condcodes (see below), overflow is checked for.
39134
39135 // FIXME: We can generalize both constraints:
39136 // - XOR/OR/AND (if they were made to survive AtomicExpand)
39137 // - LHS != 1
39138 // if the result is compared.
39139
39140 SDValue CmpLHS = Cmp.getOperand(0);
39141 SDValue CmpRHS = Cmp.getOperand(1);
39142
39143 if (!CmpLHS.hasOneUse())
39144 return SDValue();
39145
39146 unsigned Opc = CmpLHS.getOpcode();
39147 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
39148 return SDValue();
39149
39150 SDValue OpRHS = CmpLHS.getOperand(2);
39151 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
39152 if (!OpRHSC)
39153 return SDValue();
39154
39155 APInt Addend = OpRHSC->getAPIntValue();
39156 if (Opc == ISD::ATOMIC_LOAD_SUB)
39157 Addend = -Addend;
39158
39159 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
39160 if (!CmpRHSC)
39161 return SDValue();
39162
39163 APInt Comparison = CmpRHSC->getAPIntValue();
39164
39165 // If the addend is the negation of the comparison value, then we can do
39166 // a full comparison by emitting the atomic arithmetic as a locked sub.
39167 if (Comparison == -Addend) {
39168 // The CC is fine, but we need to rewrite the LHS of the comparison as an
39169 // atomic sub.
39170 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
39171 auto AtomicSub = DAG.getAtomic(
39172 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
39173 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
39174 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
39175 AN->getMemOperand());
39176 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
39177 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
39178 DAG.getUNDEF(CmpLHS.getValueType()));
39179 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
39180 return LockOp;
39181 }
39182
39183 // We can handle comparisons with zero in a number of cases by manipulating
39184 // the CC used.
39185 if (!Comparison.isNullValue())
39186 return SDValue();
39187
39188 if (CC == X86::COND_S && Addend == 1)
39189 CC = X86::COND_LE;
39190 else if (CC == X86::COND_NS && Addend == 1)
39191 CC = X86::COND_G;
39192 else if (CC == X86::COND_G && Addend == -1)
39193 CC = X86::COND_GE;
39194 else if (CC == X86::COND_LE && Addend == -1)
39195 CC = X86::COND_L;
39196 else
39197 return SDValue();
39198
39199 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
39200 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
39201 DAG.getUNDEF(CmpLHS.getValueType()));
39202 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
39203 return LockOp;
39204}
39205
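// Illustrative sketch (not part of X86ISelLowering.cpp): the kind of source
// pattern combineSetCCAtomicArith targets. With the combine, the branch can
// reuse the EFLAGS of the LOCKed add (COND_S on the old value becomes COND_LE
// on the incremented value) instead of keeping the loaded value alive for a
// separate CMP. The function and variable names are illustrative only.
#include <atomic>

bool wasNegativeBeforeIncrement(std::atomic<int> &RefCount) {
  // Lowers to (cmp (atomic_load_add RefCount, 1), 0) with a signed test.
  return RefCount.fetch_add(1, std::memory_order_seq_cst) < 0;
}
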
39206// Check whether a boolean test is testing a boolean value generated by
39207// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
39208// code.
39209//
39210// Simplify the following patterns:
39211// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
39212// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
39213// to (Op EFLAGS Cond)
39214//
39215// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
39216// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
39217// to (Op EFLAGS !Cond)
39218//
39219// where Op could be BRCOND or CMOV.
39220//
39221static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
39222 // This combine only operates on CMP-like nodes.
39223 if (!(Cmp.getOpcode() == X86ISD::CMP ||
39224 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
39225 return SDValue();
39226
39227 // Quit if not used as a boolean value.
39228 if (CC != X86::COND_E && CC != X86::COND_NE)
39229 return SDValue();
39230
39231 // Check CMP operands. One of them should be 0 or 1 and the other should be
39232 // an SetCC or extended from it.
39233 SDValue Op1 = Cmp.getOperand(0);
39234 SDValue Op2 = Cmp.getOperand(1);
39235
39236 SDValue SetCC;
39237 const ConstantSDNode* C = nullptr;
39238 bool needOppositeCond = (CC == X86::COND_E);
39239 bool checkAgainstTrue = false; // Is it a comparison against 1?
39240
39241 if ((C = dyn_cast<ConstantSDNode>(Op1)))
39242 SetCC = Op2;
39243 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
39244 SetCC = Op1;
39245 else // Quit if neither operand is a constant.
39246 return SDValue();
39247
39248 if (C->getZExtValue() == 1) {
39249 needOppositeCond = !needOppositeCond;
39250 checkAgainstTrue = true;
39251 } else if (C->getZExtValue() != 0)
39252 // Quit if the constant is neither 0 nor 1.
39253 return SDValue();
39254
39255 bool truncatedToBoolWithAnd = false;
39256 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
39257 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
39258 SetCC.getOpcode() == ISD::TRUNCATE ||
39259 SetCC.getOpcode() == ISD::AND) {
39260 if (SetCC.getOpcode() == ISD::AND) {
39261 int OpIdx = -1;
39262 if (isOneConstant(SetCC.getOperand(0)))
39263 OpIdx = 1;
39264 if (isOneConstant(SetCC.getOperand(1)))
39265 OpIdx = 0;
39266 if (OpIdx < 0)
39267 break;
39268 SetCC = SetCC.getOperand(OpIdx);
39269 truncatedToBoolWithAnd = true;
39270 } else
39271 SetCC = SetCC.getOperand(0);
39272 }
39273
39274 switch (SetCC.getOpcode()) {
39275 case X86ISD::SETCC_CARRY:
39276 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
39277 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
39278 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
39279 // truncated to i1 using 'and'.
39280 if (checkAgainstTrue && !truncatedToBoolWithAnd)
39281 break;
39282 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
39283 "Invalid use of SETCC_CARRY!");
39284 LLVM_FALLTHROUGH;
39285 case X86ISD::SETCC:
39286 // Set the condition code or opposite one if necessary.
39287 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
39288 if (needOppositeCond)
39289 CC = X86::GetOppositeBranchCondition(CC);
39290 return SetCC.getOperand(1);
39291 case X86ISD::CMOV: {
39292 // Check whether false/true value has canonical one, i.e. 0 or 1.
39293 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
39294 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
39295 // Quit if true value is not a constant.
39296 if (!TVal)
39297 return SDValue();
39298 // Quit if false value is not a constant.
39299 if (!FVal) {
39300 SDValue Op = SetCC.getOperand(0);
39301 // Skip 'zext' or 'trunc' node.
39302 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
39303 Op.getOpcode() == ISD::TRUNCATE)
39304 Op = Op.getOperand(0);
39305 // A special case for rdrand/rdseed, where 0 is set if false cond is
39306 // found.
39307 if ((Op.getOpcode() != X86ISD::RDRAND &&
39308 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
39309 return SDValue();
39310 }
39311 // Quit if false value is not the constant 0 or 1.
39312 bool FValIsFalse = true;
39313 if (FVal && FVal->getZExtValue() != 0) {
39314 if (FVal->getZExtValue() != 1)
39315 return SDValue();
39316 // If FVal is 1, opposite cond is needed.
39317 needOppositeCond = !needOppositeCond;
39318 FValIsFalse = false;
39319 }
39320 // Quit if TVal is not the constant opposite of FVal.
39321 if (FValIsFalse && TVal->getZExtValue() != 1)
39322 return SDValue();
39323 if (!FValIsFalse && TVal->getZExtValue() != 0)
39324 return SDValue();
39325 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
39326 if (needOppositeCond)
39327 CC = X86::GetOppositeBranchCondition(CC);
39328 return SetCC.getOperand(3);
39329 }
39330 }
39331
39332 return SDValue();
39333}
39334
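// Illustrative sketch (not part of X86ISelLowering.cpp): the boolean algebra
// behind checkBoolTestSetCCCombine. With b = setcc(Cond) producing exactly 0
// or 1, re-testing b against 0 or 1 collapses to testing Cond (or its
// opposite) directly on the original EFLAGS. Names are illustrative only.
constexpr int setcc(bool Cond) { return Cond ? 1 : 0; } // 0/1 SETCC output

constexpr bool eq1(bool Cond) { return setcc(Cond) == 1; } // --> Cond
constexpr bool ne0(bool Cond) { return setcc(Cond) != 0; } // --> Cond
constexpr bool eq0(bool Cond) { return setcc(Cond) == 0; } // --> !Cond
constexpr bool ne1(bool Cond) { return setcc(Cond) != 1; } // --> !Cond

static_assert(eq1(true) && !eq1(false) && ne0(true) && !ne0(false), "");
static_assert(!eq0(true) && eq0(false) && !ne1(true) && ne1(false), "");
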
39335/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
39336/// Match:
39337/// (X86or (X86setcc) (X86setcc))
39338/// (X86cmp (and (X86setcc) (X86setcc)), 0)
39339static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
39340 X86::CondCode &CC1, SDValue &Flags,
39341 bool &isAnd) {
39342 if (Cond->getOpcode() == X86ISD::CMP) {
39343 if (!isNullConstant(Cond->getOperand(1)))
39344 return false;
39345
39346 Cond = Cond->getOperand(0);
39347 }
39348
39349 isAnd = false;
39350
39351 SDValue SetCC0, SetCC1;
39352 switch (Cond->getOpcode()) {
39353 default: return false;
39354 case ISD::AND:
39355 case X86ISD::AND:
39356 isAnd = true;
39357 LLVM_FALLTHROUGH;
39358 case ISD::OR:
39359 case X86ISD::OR:
39360 SetCC0 = Cond->getOperand(0);
39361 SetCC1 = Cond->getOperand(1);
39362 break;
39363 };
39364
39365 // Make sure we have SETCC nodes, using the same flags value.
39366 if (SetCC0.getOpcode() != X86ISD::SETCC ||
39367 SetCC1.getOpcode() != X86ISD::SETCC ||
39368 SetCC0->getOperand(1) != SetCC1->getOperand(1))
39369 return false;
39370
39371 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
39372 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
39373 Flags = SetCC0->getOperand(1);
39374 return true;
39375}
39376
39377// When legalizing carry, we create carries via "add X, -1".
39378// If that comes from an actual carry, via setcc, we use the
39379// carry directly.
39380static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
39381 if (EFLAGS.getOpcode() == X86ISD::ADD) {
39382 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
39383 SDValue Carry = EFLAGS.getOperand(0);
39384 while (Carry.getOpcode() == ISD::TRUNCATE ||
39385 Carry.getOpcode() == ISD::ZERO_EXTEND ||
39386 Carry.getOpcode() == ISD::SIGN_EXTEND ||
39387 Carry.getOpcode() == ISD::ANY_EXTEND ||
39388 (Carry.getOpcode() == ISD::AND &&
39389 isOneConstant(Carry.getOperand(1))))
39390 Carry = Carry.getOperand(0);
39391 if (Carry.getOpcode() == X86ISD::SETCC ||
39392 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
39393 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
39394 uint64_t CarryCC = Carry.getConstantOperandVal(0);
39395 SDValue CarryOp1 = Carry.getOperand(1);
39396 if (CarryCC == X86::COND_B)
39397 return CarryOp1;
39398 if (CarryCC == X86::COND_A) {
39399 // Try to convert COND_A into COND_B in an attempt to facilitate
39400 // materializing "setb reg".
39401 //
39402 // Do not flip "e > c", where "c" is a constant, because Cmp
39403 // instruction cannot take an immediate as its first operand.
39404 //
39405 if (CarryOp1.getOpcode() == X86ISD::SUB &&
39406 CarryOp1.getNode()->hasOneUse() &&
39407 CarryOp1.getValueType().isInteger() &&
39408 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
39409 SDValue SubCommute =
39410 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
39411 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
39412 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
39413 }
39414 }
39415 // If this is a check of the z flag of an add with 1, switch to the
39416 // C flag.
39417 if (CarryCC == X86::COND_E &&
39418 CarryOp1.getOpcode() == X86ISD::ADD &&
39419 isOneConstant(CarryOp1.getOperand(1)))
39420 return CarryOp1;
39421 }
39422 }
39423 }
39424
39425 return SDValue();
39426}
39427
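// Illustrative sketch (not part of X86ISelLowering.cpp): why "add X, -1"
// reproduces a carry that X encodes as 0/1. For an n-bit unsigned X, adding
// 2^n - 1 carries out exactly when X != 0, so for X in {0, 1} the carry flag
// equals X and the SETCC feeding X can be used directly.
#include <cstdint>

constexpr unsigned carryOfAddAllOnes(uint32_t X) {
  // Carry-out of the 32-bit addition X + 0xFFFFFFFF, i.e. CF of "add X, -1".
  return unsigned((uint64_t(X) + 0xFFFFFFFFull) >> 32);
}

static_assert(carryOfAddAllOnes(0) == 0, "");
static_assert(carryOfAddAllOnes(1) == 1, "");
static_assert(carryOfAddAllOnes(123) == 1, "");
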
39428/// Optimize an EFLAGS definition used according to the condition code \p CC
39429/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
39430/// uses of chain values.
39431static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
39432 SelectionDAG &DAG,
39433 const X86Subtarget &Subtarget) {
39434 if (CC == X86::COND_B)
39435 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
39436 return Flags;
39437
39438 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
39439 return R;
39440 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
39441}
39442
39443/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
39444static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
39445 TargetLowering::DAGCombinerInfo &DCI,
39446 const X86Subtarget &Subtarget) {
39447 SDLoc DL(N);
39448
39449 SDValue FalseOp = N->getOperand(0);
39450 SDValue TrueOp = N->getOperand(1);
39451 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
39452 SDValue Cond = N->getOperand(3);
39453
39454 // cmov X, X, ?, ? --> X
39455 if (TrueOp == FalseOp)
39456 return TrueOp;
39457
39458 // Try to simplify the EFLAGS and condition code operands.
39459 // We can't always do this as FCMOV only supports a subset of X86 cond.
39460 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
39461 if (!(FalseOp.getValueType() == MVT::f80 ||
39462 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
39463 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
39464 !Subtarget.hasCMov() || hasFPCMov(CC)) {
39465 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
39466 Flags};
39467 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
39468 }
39469 }
39470
39471 // If this is a select between two integer constants, try to do some
39472 // optimizations. Note that the operands are ordered the opposite of SELECT
39473 // operands.
39474 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
39475 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
39476 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
39477 // larger than FalseC (the false value).
39478 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
39479 CC = X86::GetOppositeBranchCondition(CC);
39480 std::swap(TrueC, FalseC);
39481 std::swap(TrueOp, FalseOp);
39482 }
39483
39484 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
39485 // This is efficient for any integer data type (including i8/i16) and
39486 // shift amount.
39487 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
39488 Cond = getSETCC(CC, Cond, DL, DAG);
39489
39490 // Zero extend the condition if needed.
39491 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
39492
39493 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
39494 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
39495 DAG.getConstant(ShAmt, DL, MVT::i8));
39496 return Cond;
39497 }
39498
39499 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
39500 // for any integer data type, including i8/i16.
39501 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
39502 Cond = getSETCC(CC, Cond, DL, DAG);
39503
39504 // Zero extend the condition if needed.
39505 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
39506 FalseC->getValueType(0), Cond);
39507 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
39508 SDValue(FalseC, 0));
39509 return Cond;
39510 }
39511
39512 // Optimize cases that will turn into an LEA instruction. This requires
39513 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
39514 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
39515 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
39516 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
39517 "Implicit constant truncation");
39518
39519 bool isFastMultiplier = false;
39520 if (Diff.ult(10)) {
39521 switch (Diff.getZExtValue()) {
39522 default: break;
39523 case 1: // result = add base, cond
39524 case 2: // result = lea base( , cond*2)
39525 case 3: // result = lea base(cond, cond*2)
39526 case 4: // result = lea base( , cond*4)
39527 case 5: // result = lea base(cond, cond*4)
39528 case 8: // result = lea base( , cond*8)
39529 case 9: // result = lea base(cond, cond*8)
39530 isFastMultiplier = true;
39531 break;
39532 }
39533 }
39534
39535 if (isFastMultiplier) {
39536 Cond = getSETCC(CC, Cond, DL, DAG);
39537 // Zero extend the condition if needed.
39538 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
39539 Cond);
39540 // Scale the condition by the difference.
39541 if (Diff != 1)
39542 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
39543 DAG.getConstant(Diff, DL, Cond.getValueType()));
39544
39545 // Add the base if non-zero.
39546 if (FalseC->getAPIntValue() != 0)
39547 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
39548 SDValue(FalseC, 0));
39549 return Cond;
39550 }
39551 }
39552 }
39553 }
39554
39555 // Handle these cases:
39556 // (select (x != c), e, c) -> (select (x != c), e, x)
39557 // (select (x == c), c, e) -> (select (x == c), x, e)
39558 // where c is an integer constant, and the "select" is the combination
39559 // of CMOV and CMP.
39560 //
39561 // The rationale for this change is that the conditional-move from a constant
39562 // needs two instructions, however, conditional-move from a register needs
39563 // only one instruction.
39564 //
39565 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
39566 // some instruction-combining opportunities. This opt needs to be
39567 // postponed as late as possible.
39568 //
39569 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
39570 // the DCI.xxxx conditions are provided to postpone the optimization as
39571 // late as possible.
39572
39573 ConstantSDNode *CmpAgainst = nullptr;
39574 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
39575 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
39576 !isa<ConstantSDNode>(Cond.getOperand(0))) {
39577
39578 if (CC == X86::COND_NE &&
39579 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
39580 CC = X86::GetOppositeBranchCondition(CC);
39581 std::swap(TrueOp, FalseOp);
39582 }
39583
39584 if (CC == X86::COND_E &&
39585 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
39586 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
39587 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
39588 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
39589 }
39590 }
39591 }
39592
39593 // Fold and/or of setcc's to double CMOV:
39594 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
39595 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
39596 //
39597 // This combine lets us generate:
39598 // cmovcc1 (jcc1 if we don't have CMOV)
39599 // cmovcc2 (same)
39600 // instead of:
39601 // setcc1
39602 // setcc2
39603 // and/or
39604 // cmovne (jne if we don't have CMOV)
39605 // When we can't use the CMOV instruction, it might increase branch
39606 // mispredicts.
39607 // When we can use CMOV, or when there is no mispredict, this improves
39608 // throughput and reduces register pressure.
39609 //
39610 if (CC == X86::COND_NE) {
39611 SDValue Flags;
39612 X86::CondCode CC0, CC1;
39613 bool isAndSetCC;
39614 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
39615 if (isAndSetCC) {
39616 std::swap(FalseOp, TrueOp);
39617 CC0 = X86::GetOppositeBranchCondition(CC0);
39618 CC1 = X86::GetOppositeBranchCondition(CC1);
39619 }
39620
39621 SDValue LOps[] = {FalseOp, TrueOp,
39622 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
39623 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
39624 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
39625 Flags};
39626 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
39627 return CMOV;
39628 }
39629 }
39630
39631 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
39632 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
39633 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
39634 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
39635 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
39636 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
39637 SDValue Add = TrueOp;
39638 SDValue Const = FalseOp;
39639 // Canonicalize the condition code for easier matching and output.
39640 if (CC == X86::COND_E)
39641 std::swap(Add, Const);
39642
39643 // We might have replaced the constant in the cmov with the LHS of the
39644 // compare. If so change it to the RHS of the compare.
39645 if (Const == Cond.getOperand(0))
39646 Const = Cond.getOperand(1);
39647
39648 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
39649 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
39650 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
39651 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
39652 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
39653 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
39654 EVT VT = N->getValueType(0);
39655 // This should constant fold.
39656 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
39657 SDValue CMov =
39658 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
39659 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
39660 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
39661 }
39662 }
39663
39664 return SDValue();
39665}
39666
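// Illustrative sketch (not part of X86ISelLowering.cpp): scalar forms of the
// constant CMOV rewrites in combineCMov above, treating C as the 0/1 result
// of a SETCC. Names and the constants 8/100/101 are illustrative only.
#include <cstdint>

constexpr uint32_t selectPow2(bool C) { return C ? 8u : 0u; }      // C ? 8 : 0
constexpr uint32_t shiftForm(bool C)  { return uint32_t(C) << 3; } // zext(setcc) << 3

constexpr uint32_t selectAdjacent(bool C) { return C ? 101u : 100u; }    // C ? cst+1 : cst
constexpr uint32_t addForm(bool C)        { return uint32_t(C) + 100u; } // zext(setcc) + cst

static_assert(selectPow2(true) == shiftForm(true) &&
              selectPow2(false) == shiftForm(false), "");
static_assert(selectAdjacent(true) == addForm(true) &&
              selectAdjacent(false) == addForm(false), "");
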
39667/// Different mul shrinking modes.
39668enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
39669
39670static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
39671 EVT VT = N->getOperand(0).getValueType();
39672 if (VT.getScalarSizeInBits() != 32)
39673 return false;
39674
39675 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
39676 unsigned SignBits[2] = {1, 1};
39677 bool IsPositive[2] = {false, false};
39678 for (unsigned i = 0; i < 2; i++) {
39679 SDValue Opd = N->getOperand(i);
39680
39681 SignBits[i] = DAG.ComputeNumSignBits(Opd);
39682 IsPositive[i] = DAG.SignBitIsZero(Opd);
39683 }
39684
39685 bool AllPositive = IsPositive[0] && IsPositive[1];
39686 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
39687 // When ranges are from -128 ~ 127, use MULS8 mode.
39688 if (MinSignBits >= 25)
39689 Mode = ShrinkMode::MULS8;
39690 // When ranges are from 0 ~ 255, use MULU8 mode.
39691 else if (AllPositive && MinSignBits >= 24)
39692 Mode = ShrinkMode::MULU8;
39693 // When ranges are from -32768 ~ 32767, use MULS16 mode.
39694 else if (MinSignBits >= 17)
39695 Mode = ShrinkMode::MULS16;
39696 // When ranges are from 0 ~ 65535, use MULU16 mode.
39697 else if (AllPositive && MinSignBits >= 16)
39698 Mode = ShrinkMode::MULU16;
39699 else
39700 return false;
39701 return true;
39702}
39703
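// Illustrative sketch (not part of X86ISelLowering.cpp): how the sign-bit
// thresholds in canReduceVMulWidth map to value ranges for a 32-bit lane.
// With S leading copies of the sign bit there are only 32 - S payload bits,
// so S >= 25 implies the value fits in i8 and S >= 17 implies i16; for the
// unsigned modes a known-zero sign bit buys one more payload bit. The helper
// below is a scalar model of SelectionDAG::ComputeNumSignBits.
#include <cstdint>

constexpr unsigned numSignBits(int32_t V) {
  uint32_t U = uint32_t(V);
  uint32_t Sign = U >> 31;
  unsigned N = 1; // the sign bit itself
  for (int Bit = 30; Bit >= 0 && ((U >> Bit) & 1) == Sign; --Bit)
    ++N;
  return N;
}

static_assert(numSignBits(127) >= 25 && numSignBits(-128) >= 25, "");     // MULS8
static_assert(numSignBits(255) >= 24, "");                                // MULU8
static_assert(numSignBits(32767) >= 17 && numSignBits(-32768) >= 17, ""); // MULS16
static_assert(numSignBits(65535) >= 16, "");                              // MULU16
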
39704/// When the operands of vector mul are extended from smaller size values,
39705/// like i8 and i16, the type of mul may be shrunk to generate more
39706/// efficient code. Two typical patterns are handled:
39707/// Pattern1:
39708/// %2 = sext/zext <N x i8> %1 to <N x i32>
39709/// %4 = sext/zext <N x i8> %3 to <N x i32>
39710/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
39711/// %5 = mul <N x i32> %2, %4
39712///
39713/// Pattern2:
39714/// %2 = zext/sext <N x i16> %1 to <N x i32>
39715/// %4 = zext/sext <N x i16> %3 to <N x i32>
39716/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
39717/// %5 = mul <N x i32> %2, %4
39718///
39719/// There are four mul shrinking modes:
39720/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
39721/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
39722/// generate pmullw+sext32 for it (MULS8 mode).
39723/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
39724/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
39725/// generate pmullw+zext32 for it (MULU8 mode).
39726/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
39727/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
39728/// generate pmullw+pmulhw for it (MULS16 mode).
39729/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
39730/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
39731/// generate pmullw+pmulhuw for it (MULU16 mode).
39732static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
39733 const X86Subtarget &Subtarget) {
39734 // Check for legality
39735 // pmullw/pmulhw are not supported by SSE.
39736 if (!Subtarget.hasSSE2())
39737 return SDValue();
39738
39739 // Check for profitability
39740 // pmulld is supported since SSE41. It is better to use pmulld
39741 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
39742 // the expansion.
39743 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
39744 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
39745 return SDValue();
39746
39747 ShrinkMode Mode;
39748 if (!canReduceVMulWidth(N, DAG, Mode))
39749 return SDValue();
39750
39751 SDLoc DL(N);
39752 SDValue N0 = N->getOperand(0);
39753 SDValue N1 = N->getOperand(1);
39754 EVT VT = N->getOperand(0).getValueType();
39755 unsigned NumElts = VT.getVectorNumElements();
39756 if ((NumElts % 2) != 0)
39757 return SDValue();
39758
39759 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
39760
39761 // Shrink the operands of mul.
39762 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
39763 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
39764
39765 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
39766 // lower part is needed.
39767 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
39768 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
39769 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
39770 : ISD::SIGN_EXTEND,
39771 DL, VT, MulLo);
39772
39773 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
39774 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
39775 // the higher part is also needed.
39776 SDValue MulHi =
39777 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
39778 ReducedVT, NewN0, NewN1);
39779
39780 // Repack the lower part and higher part result of mul into a wider
39781 // result.
39782 // Generate shuffle functioning as punpcklwd.
39783 SmallVector<int, 16> ShuffleMask(NumElts);
39784 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
39785 ShuffleMask[2 * i] = i;
39786 ShuffleMask[2 * i + 1] = i + NumElts;
39787 }
39788 SDValue ResLo =
39789 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
39790 ResLo = DAG.getBitcast(ResVT, ResLo);
39791 // Generate shuffle functioning as punpckhwd.
39792 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
39793 ShuffleMask[2 * i] = i + NumElts / 2;
39794 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
39795 }
39796 SDValue ResHi =
39797 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
39798 ResHi = DAG.getBitcast(ResVT, ResHi);
39799 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
39800}
39801
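// Illustrative sketch (not part of X86ISelLowering.cpp): the repacking step
// in reduceVMULWidth, in scalar form. pmullw/pmulhuw produce the low and high
// 16 bits of each 16x16 product, and interleaving them word by word
// (punpcklwd/punpckhwd, i.e. the two shuffle masks built above) rebuilds the
// full 32-bit products in little-endian lane order. Names are illustrative.
#include <cstdint>

constexpr uint32_t repackLane(uint16_t A, uint16_t B) {
  uint16_t Lo = uint16_t(uint32_t(A) * B);          // one pmullw lane
  uint16_t Hi = uint16_t((uint32_t(A) * B) >> 16);  // one pmulhuw lane
  return uint32_t(Lo) | (uint32_t(Hi) << 16);       // interleaved pair viewed as i32
}

static_assert(repackLane(3, 7) == 21, "");
static_assert(repackLane(40000, 50000) == 40000u * 50000u, "");
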
39802static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
39803 EVT VT, const SDLoc &DL) {
39804
39805 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
39806 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
39807 DAG.getConstant(Mult, DL, VT));
39808 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
39809 DAG.getConstant(Shift, DL, MVT::i8));
39810 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
39811 N->getOperand(0));
39812 return Result;
39813 };
39814
39815 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
39816 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
39817 DAG.getConstant(Mul1, DL, VT));
39818 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
39819 DAG.getConstant(Mul2, DL, VT));
39820 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
39821 N->getOperand(0));
39822 return Result;
39823 };
39824
39825 switch (MulAmt) {
39826 default:
39827 break;
39828 case 11:
39829 // mul x, 11 => add ((shl (mul x, 5), 1), x)
39830 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
39831 case 21:
39832 // mul x, 21 => add ((shl (mul x, 5), 2), x)
39833 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
39834 case 41:
39835 // mul x, 41 => add ((shl (mul x, 5), 3), x)
39836 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
39837 case 22:
39838 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
39839 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
39840 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
39841 case 19:
39842 // mul x, 19 => add ((shl (mul x, 9), 1), x)
39843 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
39844 case 37:
39845 // mul x, 37 => add ((shl (mul x, 9), 2), x)
39846 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
39847 case 73:
39848 // mul x, 73 => add ((shl (mul x, 9), 3), x)
39849 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
39850 case 13:
39851 // mul x, 13 => add ((shl (mul x, 3), 2), x)
39852 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
39853 case 23:
39854 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
39855 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
39856 case 26:
39857 // mul x, 26 => add ((mul (mul x, 5), 5), x)
39858 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
39859 case 28:
39860 // mul x, 28 => add ((mul (mul x, 9), 3), x)
39861 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
39862 case 29:
39863 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
39864 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
39865 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
39866 }
39867
39868 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
39869 // by a single LEA.
39870 // First check if this is a sum of two powers of 2 because that's easy. Then
39871 // count how many zeros are up to the first set bit.
39872 // TODO: We can do this even without LEA at a cost of two shifts and an add.
39873 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
39874 unsigned ScaleShift = countTrailingZeros(MulAmt);
39875 if (ScaleShift >= 1 && ScaleShift < 4) {
39876 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
39877 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39878 DAG.getConstant(ShiftAmt, DL, MVT::i8));
39879 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
39880 DAG.getConstant(ScaleShift, DL, MVT::i8));
39881 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
39882 }
39883 }
39884
39885 return SDValue();
39886}
39887
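// Illustrative sketch (not part of X86ISelLowering.cpp): checking a few of
// the shift/LEA decompositions emitted by combineMulSpecial, in scalar form.
// Names are illustrative only.
#include <cstdint>

constexpr uint32_t mulShlAdd(uint32_t X, uint32_t M, unsigned S) {
  return ((X * M) << S) + X; // add ((shl (mul x, M), S), x)
}
constexpr uint32_t mulShlSub(uint32_t X, uint32_t M, unsigned S) {
  return ((X * M) << S) - X; // sub ((shl (mul x, M), S), x)
}

static_assert(mulShlAdd(7, 5, 1) == 7 * 11, ""); // mul x, 11
static_assert(mulShlAdd(7, 9, 2) == 7 * 37, ""); // mul x, 37
static_assert(mulShlSub(7, 3, 3) == 7 * 23, ""); // mul x, 23
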
39888// If the upper 17 bits of each element are zero then we can use PMADDWD,
39889// which is always at least as quick as PMULLD, except on KNL.
39890static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
39891 const X86Subtarget &Subtarget) {
39892 if (!Subtarget.hasSSE2())
39893 return SDValue();
39894
39895 if (Subtarget.isPMADDWDSlow())
39896 return SDValue();
39897
39898 EVT VT = N->getValueType(0);
39899
39900 // Only support vXi32 vectors.
39901 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
39902 return SDValue();
39903
39904 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
39905 // Also allow v2i32 if it will be widened.
39906 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
39907 if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
39908 return SDValue();
39909
39910 SDValue N0 = N->getOperand(0);
39911 SDValue N1 = N->getOperand(1);
39912
39913 // If we are zero extending two steps without SSE4.1, it's better to reduce
39914 // the vmul width instead.
39915 if (!Subtarget.hasSSE41() &&
39916 (N0.getOpcode() == ISD::ZERO_EXTEND &&
39917 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
39918 (N1.getOpcode() == ISD::ZERO_EXTEND &&
39919 N1.getOperand(0).getScalarValueSizeInBits() <= 8))
39920 return SDValue();
39921
39922 APInt Mask17 = APInt::getHighBitsSet(32, 17);
39923 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
39924 !DAG.MaskedValueIsZero(N0, Mask17))
39925 return SDValue();
39926
39927 // Use SplitOpsAndApply to handle AVX splitting.
39928 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39929 ArrayRef<SDValue> Ops) {
39930 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
39931 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
39932 };
39933 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
39934 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
39935 PMADDWDBuilder);
39936}
39937
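// Illustrative sketch (not part of X86ISelLowering.cpp): why zero upper 17
// bits make PMADDWD usable as a 32-bit multiply. Each i32 lane is then a
// non-negative 15-bit value in its low word with a zero high word, so the
// high-word product of a lane vanishes and the signed 16x16 low-word product
// is the exact 32-bit result. Names are illustrative only.
#include <cstdint>

constexpr int32_t pmaddwdLane(uint32_t A, uint32_t B) {
  int16_t ALo = int16_t(A & 0xFFFF), AHi = int16_t(A >> 16);
  int16_t BLo = int16_t(B & 0xFFFF), BHi = int16_t(B >> 16);
  return int32_t(ALo) * BLo + int32_t(AHi) * BHi; // one VPMADDWD lane
}

static_assert(pmaddwdLane(12345, 678) == 12345 * 678, "");
static_assert(pmaddwdLane(0x7FFF, 0x7FFF) == 0x7FFF * 0x7FFF, "");
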
39938static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
39939 const X86Subtarget &Subtarget) {
39940 if (!Subtarget.hasSSE2())
39941 return SDValue();
39942
39943 EVT VT = N->getValueType(0);
39944
39945 // Only support vXi64 vectors.
39946 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
39947 VT.getVectorNumElements() < 2 ||
39948 !isPowerOf2_32(VT.getVectorNumElements()))
39949 return SDValue();
39950
39951 SDValue N0 = N->getOperand(0);
39952 SDValue N1 = N->getOperand(1);
39953
39954 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
39955 // 32 bits. We can lower with this if the sign bits stretch that far.
39956 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
39957 DAG.ComputeNumSignBits(N1) > 32) {
39958 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39959 ArrayRef<SDValue> Ops) {
39960 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
39961 };
39962 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
39963 PMULDQBuilder, /*CheckBWI*/false);
39964 }
39965
39966 // If the upper bits are zero we can use a single pmuludq.
39967 APInt Mask = APInt::getHighBitsSet(64, 32);
39968 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
39969 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39970 ArrayRef<SDValue> Ops) {
39971 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
39972 };
39973 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
39974 PMULUDQBuilder, /*CheckBWI*/false);
39975 }
39976
39977 return SDValue();
39978}
39979
39980/// Optimize a single multiply with constant into two operations in order to
39981/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
39982static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
39983 TargetLowering::DAGCombinerInfo &DCI,
39984 const X86Subtarget &Subtarget) {
39985 EVT VT = N->getValueType(0);
39986
39987 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
39988 return V;
39989
39990 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
39991 return V;
39992
39993 if (DCI.isBeforeLegalize() && VT.isVector())
39994 return reduceVMULWidth(N, DAG, Subtarget);
39995
39996 if (!MulConstantOptimization)
39997 return SDValue();
39998 // An imul is usually smaller than the alternative sequence.
39999 if (DAG.getMachineFunction().getFunction().hasMinSize())
40000 return SDValue();
40001
40002 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
40003 return SDValue();
40004
40005 if (VT != MVT::i64 && VT != MVT::i32)
40006 return SDValue();
40007
40008 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
40009 if (!C)
40010 return SDValue();
40011 if (isPowerOf2_64(C->getZExtValue()))
40012 return SDValue();
40013
40014 int64_t SignMulAmt = C->getSExtValue();
40015 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
40016 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
40017
40018 SDLoc DL(N);
40019 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
40020 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
40021 DAG.getConstant(AbsMulAmt, DL, VT));
40022 if (SignMulAmt < 0)
40023 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
40024 NewMul);
40025
40026 return NewMul;
40027 }
40028
40029 uint64_t MulAmt1 = 0;
40030 uint64_t MulAmt2 = 0;
40031 if ((AbsMulAmt % 9) == 0) {
40032 MulAmt1 = 9;
40033 MulAmt2 = AbsMulAmt / 9;
40034 } else if ((AbsMulAmt % 5) == 0) {
40035 MulAmt1 = 5;
40036 MulAmt2 = AbsMulAmt / 5;
40037 } else if ((AbsMulAmt % 3) == 0) {
40038 MulAmt1 = 3;
40039 MulAmt2 = AbsMulAmt / 3;
40040 }
40041
40042 SDValue NewMul;
40043 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
40044 if (MulAmt2 &&
40045 (isPowerOf2_64(MulAmt2) ||
40046 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
40047
40048 if (isPowerOf2_64(MulAmt2) &&
40049 !(SignMulAmt >= 0 && N->hasOneUse() &&
40050 N->use_begin()->getOpcode() == ISD::ADD))
40051 // If the second multiplier is pow2, issue it first. We want the multiply by
40052 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
40053 // is an add. Only do this for positive multiply amounts since the
40054 // negate would prevent it from being used as an address mode anyway.
40055 std::swap(MulAmt1, MulAmt2);
40056
40057 if (isPowerOf2_64(MulAmt1))
40058 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
40059 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
40060 else
40061 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
40062 DAG.getConstant(MulAmt1, DL, VT));
40063
40064 if (isPowerOf2_64(MulAmt2))
40065 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
40066 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
40067 else
40068 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
40069 DAG.getConstant(MulAmt2, DL, VT));
40070
40071 // Negate the result.
40072 if (SignMulAmt < 0)
40073 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
40074 NewMul);
40075 } else if (!Subtarget.slowLEA())
40076 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
40077
40078 if (!NewMul) {
40079 assert(C->getZExtValue() != 0 &&
40080 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
40081 "Both cases that could cause potential overflows should have "
40082 "already been handled.");
40083 if (isPowerOf2_64(AbsMulAmt - 1)) {
40084 // (mul x, 2^N + 1) => (add (shl x, N), x)
40085 NewMul = DAG.getNode(
40086 ISD::ADD, DL, VT, N->getOperand(0),
40087 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
40088 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
40089 MVT::i8)));
40090 // To negate, subtract the number from zero
40091 if (SignMulAmt < 0)
40092 NewMul = DAG.getNode(ISD::SUB, DL, VT,
40093 DAG.getConstant(0, DL, VT), NewMul);
40094 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
40095 // (mul x, 2^N - 1) => (sub (shl x, N), x)
40096 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
40097 DAG.getConstant(Log2_64(AbsMulAmt + 1),
40098 DL, MVT::i8));
40099 // To negate, reverse the operands of the subtract.
40100 if (SignMulAmt < 0)
40101 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
40102 else
40103 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
40104 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
40105 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
40106 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
40107 DAG.getConstant(Log2_64(AbsMulAmt - 2),
40108 DL, MVT::i8));
40109 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
40110 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
40111 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
40112 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
40113 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
40114 DAG.getConstant(Log2_64(AbsMulAmt + 2),
40115 DL, MVT::i8));
40116 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
40117 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
40118 }
40119 }
40120
40121 return NewMul;
40122}
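// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The scalar identities the constant-multiply decomposition above relies on,
// checked for a few sample amounts: 45 = 9 * 5 becomes two LEA-style
// multiplies, 40 = 5 * 8 becomes LEA plus a shift, 17 = 2^4 + 1 becomes
// shl + add, 30 = 2^5 - 2 becomes shl followed by two subtracts, and a
// negative amount negates the positive-amount result.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x123456789ull;
  assert(x * 45 == (x * 9) * 5);                          // MUL_IMM 9, MUL_IMM 5
  assert(x * 40 == (x * 5) << 3);                         // MUL_IMM 5, SHL 3
  assert(x * 17 == (x << 4) + x);                         // 2^N + 1 -> shl + add
  assert(x * 30 == ((x << 5) - x) - x);                   // 2^N - 2 -> shl + 2*sub
  assert(x * uint64_t(-45) == uint64_t(0) - (x * 9) * 5); // negate at the end
  return 0;
}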
40123
40124static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
40125 SDValue N0 = N->getOperand(0);
40126 SDValue N1 = N->getOperand(1);
40127 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
40128 EVT VT = N0.getValueType();
40129
40130 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
40131 // since the result of setcc_c is all zeros or all ones.
40132 if (VT.isInteger() && !VT.isVector() &&
40133 N1C && N0.getOpcode() == ISD::AND &&
40134 N0.getOperand(1).getOpcode() == ISD::Constant) {
40135 SDValue N00 = N0.getOperand(0);
40136 APInt Mask = N0.getConstantOperandAPInt(1);
40137 Mask <<= N1C->getAPIntValue();
40138 bool MaskOK = false;
40139 // We can handle cases concerning bit-widening nodes containing setcc_c if
40140 // we carefully interrogate the mask to make sure we are semantics
40141 // preserving.
40142 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
40143 // of the underlying setcc_c operation if the setcc_c was zero extended.
40144 // Consider the following example:
40145 // zext(setcc_c) -> i32 0x0000FFFF
40146 // c1 -> i32 0x0000FFFF
40147 // c2 -> i32 0x00000001
40148 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
40149 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
40150 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
40151 MaskOK = true;
40152 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
40153 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
40154 MaskOK = true;
40155 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
40156 N00.getOpcode() == ISD::ANY_EXTEND) &&
40157 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
40158 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
40159 }
40160 if (MaskOK && Mask != 0) {
40161 SDLoc DL(N);
40162 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
40163 }
40164 }
40165
40166 // Hardware support for vector shifts is sparse which makes us scalarize the
40167 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
40168 // shl.
40169 // (shl V, 1) -> add V,V
40170 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
40171 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
40172 assert(N0.getValueType().isVector() && "Invalid vector shift type");
40173 // We shift all of the values by one. In many cases we do not have
40174 // hardware support for this operation. This is better expressed as an ADD
40175 // of two values.
40176 if (N1SplatC->isOne())
40177 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
40178 }
40179
40180 return SDValue();
40181}
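// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The counterexample from the comment above, reproduced on scalars: with a
// zero-extended setcc_c value of 0x0000FFFF, hoisting the shift past the mask
// changes the result, which is exactly what the MaskOK check guards against.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t setcc_c = 0x0000FFFFu, c1 = 0x0000FFFFu, c2 = 1;
  uint32_t shlOfAnd = (setcc_c & c1) << c2; // original:  0x0001FFFE
  uint32_t andOfShl = setcc_c & (c1 << c2); // rewritten: 0x0000FFFE
  assert(shlOfAnd == 0x0001FFFEu && andOfShl == 0x0000FFFEu);
  assert(shlOfAnd != andOfShl);             // the fold would be wrong here
  return 0;
}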
40182
40183static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
40184 SDValue N0 = N->getOperand(0);
40185 SDValue N1 = N->getOperand(1);
40186 EVT VT = N0.getValueType();
40187 unsigned Size = VT.getSizeInBits();
40188
40189 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
40190 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
40191 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
40192 // depending on sign of (SarConst - [56,48,32,24,16])
40193
40194 // sexts in X86 are MOVs. The MOVs have the same code size
40195 // as above SHIFTs (only SHIFT on 1 has lower code size).
40196 // However the MOVs have 2 advantages to a SHIFT:
40197 // 1. MOVs can write to a register that differs from source
40198 // 2. MOVs accept memory operands
40199
40200 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
40201 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
40202 N0.getOperand(1).getOpcode() != ISD::Constant)
40203 return SDValue();
40204
40205 SDValue N00 = N0.getOperand(0);
40206 SDValue N01 = N0.getOperand(1);
40207 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
40208 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
40209 EVT CVT = N1.getValueType();
40210
40211 if (SarConst.isNegative())
40212 return SDValue();
40213
40214 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
40215 unsigned ShiftSize = SVT.getSizeInBits();
40216 // Skip types without a corresponding sext/zext and
40217 // ShlConst values that are not one of [56,48,32,24,16].
40218 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
40219 continue;
40220 SDLoc DL(N);
40221 SDValue NN =
40222 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
40223 SarConst = SarConst - (Size - ShiftSize);
40224 if (SarConst == 0)
40225 return NN;
40226 else if (SarConst.isNegative())
40227 return DAG.getNode(ISD::SHL, DL, VT, NN,
40228 DAG.getConstant(-SarConst, DL, CVT));
40229 else
40230 return DAG.getNode(ISD::SRA, DL, VT, NN,
40231 DAG.getConstant(SarConst, DL, CVT));
40232 }
40233 return SDValue();
40234}
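// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// A scalar check of the (ashr (shl x, C), SarConst) fold above for i32 with
// C = 24: the pair of shifts is equivalent to sign-extending the low byte
// (sext_inreg i8, i.e. a MOVSX) followed by an arithmetic shift right by
// SarConst - 24. Two's-complement arithmetic right shift is assumed.
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {0, 1, -1, 0x7F, 0x80, 0xAB, 0x12345678}) {
    int32_t viaShifts = (int32_t)((uint32_t)x << 24) >> 26;
    int32_t viaSext   = (int32_t)(int8_t)(x & 0xFF) >> 2; // sext_inreg + sra 2
    assert(viaShifts == viaSext);
  }
  return 0;
}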
40235
40236static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
40237 TargetLowering::DAGCombinerInfo &DCI) {
40238 SDValue N0 = N->getOperand(0);
40239 SDValue N1 = N->getOperand(1);
40240 EVT VT = N0.getValueType();
40241
40242 // Only do this on the last DAG combine as it can interfere with other
40243 // combines.
40244 if (!DCI.isAfterLegalizeDAG())
40245 return SDValue();
40246
40247 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
40248 // TODO: This is a generic DAG combine that became an x86-only combine to
40249 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
40250 // and-not ('andn').
40251 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
40252 return SDValue();
40253
40254 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
40255 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
40256 if (!ShiftC || !AndC)
40257 return SDValue();
40258
40259 // If we can shrink the constant mask below 8-bits or 32-bits, then this
40260 // transform should reduce code size. It may also enable secondary transforms
40261 // from improved known-bits analysis or instruction selection.
40262 APInt MaskVal = AndC->getAPIntValue();
40263
40264 // If this can be matched by a zero extend, don't optimize.
40265 if (MaskVal.isMask()) {
40266 unsigned TO = MaskVal.countTrailingOnes();
40267 if (TO >= 8 && isPowerOf2_32(TO))
40268 return SDValue();
40269 }
40270
40271 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
40272 unsigned OldMaskSize = MaskVal.getMinSignedBits();
40273 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
40274 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
40275 (OldMaskSize > 32 && NewMaskSize <= 32)) {
40276 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
40277 SDLoc DL(N);
40278 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
40279 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
40280 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
40281 }
40282 return SDValue();
40283}
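// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The mask-shrinking reassociation above, checked on scalars: shifting before
// masking turns the wide immediate 0x7F00 into 0x7F, which fits a sign-extended
// 8-bit immediate, matching the OldMaskSize > 8 && NewMaskSize <= 8 test.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 0x12345678u, 0xFFFFFFFFu, 0xABCDu})
    assert(((x & 0x7F00u) >> 8) == ((x >> 8) & 0x7Fu));
  return 0;
}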
40284
40285static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
40286 TargetLowering::DAGCombinerInfo &DCI,
40287 const X86Subtarget &Subtarget) {
40288 unsigned Opcode = N->getOpcode();
40289 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
40290 "Unexpected shift opcode");
40291
40292 EVT VT = N->getValueType(0);
40293 SDValue N0 = N->getOperand(0);
40294 SDValue N1 = N->getOperand(1);
40295 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
40296 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
40297 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
40298 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
40299 "Unexpected PACKSS/PACKUS input type");
40300
40301 bool IsSigned = (X86ISD::PACKSS == Opcode);
40302
40303 // Constant Folding.
40304 APInt UndefElts0, UndefElts1;
40305 SmallVector<APInt, 32> EltBits0, EltBits1;
40306 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
40307 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
40308 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
40309 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
40310 unsigned NumLanes = VT.getSizeInBits() / 128;
40311 unsigned NumDstElts = VT.getVectorNumElements();
40312 unsigned NumSrcElts = NumDstElts / 2;
40313 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
40314 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
40315
40316 APInt Undefs(NumDstElts, 0);
40317 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
40318 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
40319 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
40320 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
40321 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
40322 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
40323
40324 if (UndefElts[SrcIdx]) {
40325 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
40326 continue;
40327 }
40328
40329 APInt &Val = EltBits[SrcIdx];
40330 if (IsSigned) {
40331 // PACKSS: Truncate signed value with signed saturation.
40332 // Source values less than dst minint are saturated to minint.
40333 // Source values greater than dst maxint are saturated to maxint.
40334 if (Val.isSignedIntN(DstBitsPerElt))
40335 Val = Val.trunc(DstBitsPerElt);
40336 else if (Val.isNegative())
40337 Val = APInt::getSignedMinValue(DstBitsPerElt);
40338 else
40339 Val = APInt::getSignedMaxValue(DstBitsPerElt);
40340 } else {
40341 // PACKUS: Truncate signed value with unsigned saturation.
40342 // Source values less than zero are saturated to zero.
40343 // Source values greater than dst maxuint are saturated to maxuint.
40344 if (Val.isIntN(DstBitsPerElt))
40345 Val = Val.trunc(DstBitsPerElt);
40346 else if (Val.isNegative())
40347 Val = APInt::getNullValue(DstBitsPerElt);
40348 else
40349 Val = APInt::getAllOnesValue(DstBitsPerElt);
40350 }
40351 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
40352 }
40353 }
40354
40355 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
40356 }
40357
40358 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
40359 // truncate to create a larger truncate.
40360 if (Subtarget.hasAVX512() &&
40361 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
40362 N0.getOperand(0).getValueType() == MVT::v8i32) {
40363 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
40364 (!IsSigned &&
40365 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
40366 if (Subtarget.hasVLX())
40367 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
40368
40369 // Widen input to v16i32 so we can truncate that.
40370 SDLoc dl(N);
40371 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
40372 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
40373 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
40374 }
40375 }
40376
40377 // Attempt to combine as shuffle.
40378 SDValue Op(N, 0);
40379 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40380 return Res;
40381
40382 return SDValue();
40383}
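// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The per-element saturation rules used by the constant folding above, written
// as scalar helpers for the i16 -> i8 case. Helper names are illustrative only.
#include <cassert>
#include <cstdint>

static int8_t packss_model(int16_t v) {  // signed saturation (PACKSS)
  return v < -128 ? int8_t(-128) : v > 127 ? int8_t(127) : int8_t(v);
}
static uint8_t packus_model(int16_t v) { // unsigned saturation of signed input (PACKUS)
  return v < 0 ? uint8_t(0) : v > 255 ? uint8_t(255) : uint8_t(v);
}

int main() {
  assert(packss_model(1000) == 127 && packss_model(-1000) == -128);
  assert(packss_model(42) == 42);
  assert(packus_model(-5) == 0 && packus_model(300) == 255 && packus_model(200) == 200);
  return 0;
}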
40384
40385static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
40386 TargetLowering::DAGCombinerInfo &DCI,
40387 const X86Subtarget &Subtarget) {
40388 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
40389 X86ISD::VSRL == N->getOpcode()) &&
40390 "Unexpected shift opcode");
40391 EVT VT = N->getValueType(0);
40392 SDValue N0 = N->getOperand(0);
40393 SDValue N1 = N->getOperand(1);
40394
40395 // Shift zero -> zero.
40396 if (ISD::isBuildVectorAllZeros(N0.getNode()))
40397 return DAG.getConstant(0, SDLoc(N), VT);
40398
40399 // Detect constant shift amounts.
40400 APInt UndefElts;
40401 SmallVector<APInt, 32> EltBits;
40402 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
40403 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
40404 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
40405 EltBits[0].getZExtValue(), DAG);
40406 }
40407
40408 APInt KnownUndef, KnownZero;
40409 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40410 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
40411 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
40412 KnownZero, DCI))
40413 return SDValue(N, 0);
40414
40415 return SDValue();
40416}
40417
40418static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
40419 TargetLowering::DAGCombinerInfo &DCI,
40420 const X86Subtarget &Subtarget) {
40421 unsigned Opcode = N->getOpcode();
40422 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
40423 X86ISD::VSRLI == Opcode) &&
40424 "Unexpected shift opcode");
40425 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
40426 EVT VT = N->getValueType(0);
40427 SDValue N0 = N->getOperand(0);
40428 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
40429 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
40430 "Unexpected value type");
40431 assert(N->getOperand(1).getValueType() == MVT::i8 &&
40432 "Unexpected shift amount type");
40433
40434 // Out of range logical bit shifts are guaranteed to be zero.
40435 // Out of range arithmetic bit shifts splat the sign bit.
40436 unsigned ShiftVal = N->getConstantOperandVal(1);
40437 if (ShiftVal >= NumBitsPerElt) {
40438 if (LogicalShift)
40439 return DAG.getConstant(0, SDLoc(N), VT);
40440 else
40441 ShiftVal = NumBitsPerElt - 1;
40442 }
40443
40444 // Shift N0 by zero -> N0.
40445 if (!ShiftVal)
40446 return N0;
40447
40448 // Shift zero -> zero.
40449 if (ISD::isBuildVectorAllZeros(N0.getNode()))
40450 return DAG.getConstant(0, SDLoc(N), VT);
40451
40452 // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
40453 // clamped to (NumBitsPerElt - 1).
40454 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
40455 unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
40456 unsigned NewShiftVal = ShiftVal + ShiftVal2;
40457 if (NewShiftVal >= NumBitsPerElt)
40458 NewShiftVal = NumBitsPerElt - 1;
40459 return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
40460 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
40461 }
40462
40463 // We can decode 'whole byte' logical bit shifts as shuffles.
40464 if (LogicalShift && (ShiftVal % 8) == 0) {
40465 SDValue Op(N, 0);
40466 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40467 return Res;
40468 }
40469
40470 // Constant Folding.
40471 APInt UndefElts;
40472 SmallVector<APInt, 32> EltBits;
40473 if (N->isOnlyUserOf(N0.getNode()) &&
40474 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
40475 assert(EltBits.size() == VT.getVectorNumElements() &&
40476 "Unexpected shift value type");
40477 for (APInt &Elt : EltBits) {
40478 if (X86ISD::VSHLI == Opcode)
40479 Elt <<= ShiftVal;
40480 else if (X86ISD::VSRAI == Opcode)
40481 Elt.ashrInPlace(ShiftVal);
40482 else
40483 Elt.lshrInPlace(ShiftVal);
40484 }
40485 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
40486 }
40487
40488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40489 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
40490 APInt::getAllOnesValue(NumBitsPerElt), DCI))
40491 return SDValue(N, 0);
40492
40493 return SDValue();
40494}
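// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The VSRAI folding rule above on a single 32-bit lane: two arithmetic right
// shifts add their amounts, and any total of 32 or more behaves like a shift
// by 31 (the sign bit is splatted). Two's-complement >> on negative values is
// assumed to be an arithmetic shift, as it is on the targets of interest.
#include <cassert>
#include <cstdint>

static int32_t sra(int32_t v, unsigned amt) { return v >> amt; }

int main() {
  for (int32_t x : {0, 5, -5, INT32_MIN, INT32_MAX}) {
    assert(sra(sra(x, 3), 10) == sra(x, 13));  // amounts simply add
    assert(sra(sra(x, 30), 30) == sra(x, 31)); // 60 clamps to NumBits - 1
  }
  return 0;
}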
40495
40496static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
40497 TargetLowering::DAGCombinerInfo &DCI,
40498 const X86Subtarget &Subtarget) {
40499 EVT VT = N->getValueType(0);
40500 assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
40501 (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
40502 "Unexpected vector insertion");
40503
40504 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
40505 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40506 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
40507 APInt::getAllOnesValue(NumBitsPerElt), DCI))
40508 return SDValue(N, 0);
40509
40510 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
40511 SDValue Op(N, 0);
40512 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
40513 return Res;
40514
40515 return SDValue();
40516}
40517
40518/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
40519/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
40520/// OR -> CMPNEQSS.
40521static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
40522 TargetLowering::DAGCombinerInfo &DCI,
40523 const X86Subtarget &Subtarget) {
40524 unsigned opcode;
40525
40526 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
40527 // we're requiring SSE2 for both.
40528 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
40529 SDValue N0 = N->getOperand(0);
40530 SDValue N1 = N->getOperand(1);
40531 SDValue CMP0 = N0.getOperand(1);
40532 SDValue CMP1 = N1.getOperand(1);
40533 SDLoc DL(N);
40534
40535 // The SETCCs should both refer to the same CMP.
40536 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
40537 return SDValue();
40538
40539 SDValue CMP00 = CMP0->getOperand(0);
40540 SDValue CMP01 = CMP0->getOperand(1);
40541 EVT VT = CMP00.getValueType();
40542
40543 if (VT == MVT::f32 || VT == MVT::f64) {
40544 bool ExpectingFlags = false;
40545 // Check for any users that want flags:
40546 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
40547 !ExpectingFlags && UI != UE; ++UI)
40548 switch (UI->getOpcode()) {
40549 default:
40550 case ISD::BR_CC:
40551 case ISD::BRCOND:
40552 case ISD::SELECT:
40553 ExpectingFlags = true;
40554 break;
40555 case ISD::CopyToReg:
40556 case ISD::SIGN_EXTEND:
40557 case ISD::ZERO_EXTEND:
40558 case ISD::ANY_EXTEND:
40559 break;
40560 }
40561
40562 if (!ExpectingFlags) {
40563 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
40564 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
40565
40566 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
40567 X86::CondCode tmp = cc0;
40568 cc0 = cc1;
40569 cc1 = tmp;
40570 }
40571
40572 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
40573 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
40574 // FIXME: need symbolic constants for these magic numbers.
40575 // See X86ATTInstPrinter.cpp:printSSECC().
40576 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
40577 if (Subtarget.hasAVX512()) {
40578 SDValue FSetCC =
40579 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
40580 DAG.getTargetConstant(x86cc, DL, MVT::i8));
40581 // Need to fill with zeros to ensure the bitcast will produce zeroes
40582 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
40583 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
40584 DAG.getConstant(0, DL, MVT::v16i1),
40585 FSetCC, DAG.getIntPtrConstant(0, DL));
40586 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
40587 N->getSimpleValueType(0));
40588 }
40589 SDValue OnesOrZeroesF =
40590 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
40591 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
40592
40593 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
40594 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
40595
40596 if (is64BitFP && !Subtarget.is64Bit()) {
40597 // On a 32-bit target, we cannot bitcast the 64-bit float to a
40598 // 64-bit integer, since that's not a legal type. Since
40599 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
40600 // bits, but can do this little dance to extract the lowest 32 bits
40601 // and work with those going forward.
40602 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
40603 OnesOrZeroesF);
40604 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
40605 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
40606 Vector32, DAG.getIntPtrConstant(0, DL));
40607 IntVT = MVT::i32;
40608 }
40609
40610 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
40611 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
40612 DAG.getConstant(1, DL, IntVT));
40613 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
40614 ANDed);
40615 return OneBitOfTruth;
40616 }
40617 }
40618 }
40619 }
40620 return SDValue();
40621}
40622
40623/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
40624static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
40625 assert(N->getOpcode() == ISD::AND);
40626
40627 MVT VT = N->getSimpleValueType(0);
40628 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
40629 return SDValue();
40630
40631 SDValue X, Y;
40632 SDValue N0 = N->getOperand(0);
40633 SDValue N1 = N->getOperand(1);
40634
40635 if (SDValue Not = IsNOT(N0, DAG)) {
40636 X = Not;
40637 Y = N1;
40638 } else if (SDValue Not = IsNOT(N1, DAG)) {
40639 X = Not;
40640 Y = N0;
40641 } else
40642 return SDValue();
40643
40644 X = DAG.getBitcast(VT, X);
40645 Y = DAG.getBitcast(VT, Y);
40646 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
40647}
40648
40649// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
40650// logical operations, like in the example below.
40651// or (and (truncate x, truncate y)),
40652// (xor (truncate z, build_vector (constants)))
40653// Given a target type \p VT, we generate
40654// or (and x, y), (xor z, zext(build_vector (constants)))
40655 // given that x, y and z are of type \p VT. We can do so if each operand is
40656 // either a truncate from VT, a vector of constants (for the second operand),
40657 // or can be recursively promoted.
40658static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
40659 unsigned Depth) {
40660 // Limit recursion to avoid excessive compile times.
40661 if (Depth >= SelectionDAG::MaxRecursionDepth)
40662 return SDValue();
40663
40664 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
40665 N->getOpcode() != ISD::OR)
40666 return SDValue();
40667
40668 SDValue N0 = N->getOperand(0);
40669 SDValue N1 = N->getOperand(1);
40670 SDLoc DL(N);
40671
40672 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40673 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
40674 return SDValue();
40675
40676 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
40677 N0 = NN0;
40678 else {
40679 // The left side has to be a trunc.
40680 if (N0.getOpcode() != ISD::TRUNCATE)
40681 return SDValue();
40682
40683 // The type of the truncated inputs.
40684 if (N0.getOperand(0).getValueType() != VT)
40685 return SDValue();
40686
40687 N0 = N0.getOperand(0);
40688 }
40689
40690 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
40691 N1 = NN1;
40692 else {
40693 // The right side has to be a 'trunc' or a constant vector.
40694 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
40695 N1.getOperand(0).getValueType() == VT;
40696 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
40697 return SDValue();
40698
40699 if (RHSTrunc)
40700 N1 = N1.getOperand(0);
40701 else
40702 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
40703 }
40704
40705 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
40706}
40707
40708// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
40709// register. In most cases we actually compare or select YMM-sized registers
40710// and mixing the two types creates horrible code. This method optimizes
40711// some of the transition sequences.
40712// Even with AVX-512 this is still useful for removing casts around logical
40713// operations on vXi1 mask types.
40714static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
40715 const X86Subtarget &Subtarget) {
40716 EVT VT = N->getValueType(0);
40717 assert(VT.isVector() && "Expected vector type");
40718
40719 SDLoc DL(N);
40720 assert((N->getOpcode() == ISD::ANY_EXTEND ||
40721 N->getOpcode() == ISD::ZERO_EXTEND ||
40722 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
40723
40724 SDValue Narrow = N->getOperand(0);
40725 EVT NarrowVT = Narrow.getValueType();
40726
40727 // Generate the wide operation.
40728 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
40729 if (!Op)
40730 return SDValue();
40731 switch (N->getOpcode()) {
40732 default: llvm_unreachable("Unexpected opcode");
40733 case ISD::ANY_EXTEND:
40734 return Op;
40735 case ISD::ZERO_EXTEND:
40736 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
40737 case ISD::SIGN_EXTEND:
40738 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
40739 Op, DAG.getValueType(NarrowVT));
40740 }
40741}
40742
40743/// If both input operands of a logic op are being cast from floating point
40744/// types, try to convert this into a floating point logic node to avoid
40745/// unnecessary moves from SSE to integer registers.
40746static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
40747 const X86Subtarget &Subtarget) {
40748 EVT VT = N->getValueType(0);
40749 SDValue N0 = N->getOperand(0);
40750 SDValue N1 = N->getOperand(1);
40751 SDLoc DL(N);
40752
40753 if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
40754 return SDValue();
40755
40756 SDValue N00 = N0.getOperand(0);
40757 SDValue N10 = N1.getOperand(0);
40758 EVT N00Type = N00.getValueType();
40759 EVT N10Type = N10.getValueType();
40760
40761 // Ensure that both types are the same and are legal scalar fp types.
40762 if (N00Type != N10Type ||
40763 !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
40764 (Subtarget.hasSSE2() && N00Type == MVT::f64)))
40765 return SDValue();
40766
40767 unsigned FPOpcode;
40768 switch (N->getOpcode()) {
40769 default: llvm_unreachable("Unexpected input node for FP logic conversion");
40770 case ISD::AND: FPOpcode = X86ISD::FAND; break;
40771 case ISD::OR: FPOpcode = X86ISD::FOR; break;
40772 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
40773 }
40774
40775 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
40776 return DAG.getBitcast(VT, FPLogic);
40777}
40778
40779 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
40780 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
40781/// with a shift-right to eliminate loading the vector constant mask value.
40782static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
40783 const X86Subtarget &Subtarget) {
40784 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
40785 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
40786 EVT VT0 = Op0.getValueType();
40787 EVT VT1 = Op1.getValueType();
40788
40789 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
40790 return SDValue();
40791
40792 APInt SplatVal;
40793 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
40794 !SplatVal.isMask())
40795 return SDValue();
40796
40797 // Don't prevent creation of ANDN.
40798 if (isBitwiseNot(Op0))
40799 return SDValue();
40800
40801 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
40802 return SDValue();
40803
40804 unsigned EltBitWidth = VT0.getScalarSizeInBits();
40805 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
40806 return SDValue();
40807
40808 SDLoc DL(N);
40809 unsigned ShiftVal = SplatVal.countTrailingOnes();
40810 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
40811 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
40812 return DAG.getBitcast(N->getValueType(0), Shift);
40813}
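// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The idea behind the combine above on one 32-bit lane: when every element is
// known to be all-zeros or all-ones (EltBitWidth sign bits), ANDing with a
// low-bits mask of k ones equals a logical shift right by 32 - k, so the
// vector constant mask never has to be loaded.
#include <cassert>
#include <cstdint>

int main() {
  for (int32_t boolish : {0, -1}) {  // all-zeros or all-ones lane
    uint32_t v = uint32_t(boolish);
    assert((v & 0x1u) == (v >> 31)); // k = 1 -> VSRLI by 31
    assert((v & 0x7u) == (v >> 29)); // k = 3 -> VSRLI by 29
  }
  return 0;
}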
40814
40815// Get the index node from the lowered DAG of a GEP IR instruction with one
40816// indexing dimension.
40817static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
40818 if (Ld->isIndexed())
40819 return SDValue();
40820
40821 SDValue Base = Ld->getBasePtr();
40822
40823 if (Base.getOpcode() != ISD::ADD)
40824 return SDValue();
40825
40826 SDValue ShiftedIndex = Base.getOperand(0);
40827
40828 if (ShiftedIndex.getOpcode() != ISD::SHL)
40829 return SDValue();
40830
40831 return ShiftedIndex.getOperand(0);
40832
40833}
40834
40835static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
40836 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
40837 switch (VT.getSizeInBits()) {
40838 default: return false;
40839 case 64: return Subtarget.is64Bit() ? true : false;
40840 case 32: return true;
40841 }
40842 }
40843 return false;
40844}
40845
40846 // This function recognizes cases where the X86 bzhi instruction can replace an
40847// 'and-load' sequence.
40848 // In the case of loading an integer value from an array of constants defined
40849// as follows:
40850//
40851// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
40852//
40853// then applying a bitwise and on the result with another input.
40854// It's equivalent to performing bzhi (zero high bits) on the input, with the
40855// same index of the load.
40856static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
40857 const X86Subtarget &Subtarget) {
40858 MVT VT = Node->getSimpleValueType(0);
40859 SDLoc dl(Node);
40860
40861 // Check if subtarget has BZHI instruction for the node's type
40862 if (!hasBZHI(Subtarget, VT))
40863 return SDValue();
40864
40865 // Try matching the pattern for both operands.
40866 for (unsigned i = 0; i < 2; i++) {
40867 SDValue N = Node->getOperand(i);
40868 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
40869
40870 // bail out if the operand is not a load instruction
40871 if (!Ld)
40872 return SDValue();
40873
40874 const Value *MemOp = Ld->getMemOperand()->getValue();
40875
40876 if (!MemOp)
40877 return SDValue();
40878
40879 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
40880 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
40881 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
40882
40883 Constant *Init = GV->getInitializer();
40884 Type *Ty = Init->getType();
40885 if (!isa<ConstantDataArray>(Init) ||
40886 !Ty->getArrayElementType()->isIntegerTy() ||
40887 Ty->getArrayElementType()->getScalarSizeInBits() !=
40888 VT.getSizeInBits() ||
40889 Ty->getArrayNumElements() >
40890 Ty->getArrayElementType()->getScalarSizeInBits())
40891 continue;
40892
40893 // Check if the array's constant elements are suitable to our case.
40894 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
40895 bool ConstantsMatch = true;
40896 for (uint64_t j = 0; j < ArrayElementCount; j++) {
40897 ConstantInt *Elem =
40898 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
40899 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
40900 ConstantsMatch = false;
40901 break;
40902 }
40903 }
40904 if (!ConstantsMatch)
40905 continue;
40906
40907 // Do the transformation (For 32-bit type):
40908 // -> (and (load arr[idx]), inp)
40909 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
40910 // that will be replaced with one bzhi instruction.
40911 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
40912 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
40913
40914 // Get the Node which indexes into the array.
40915 SDValue Index = getIndexFromUnindexedLoad(Ld);
40916 if (!Index)
40917 return SDValue();
40918 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
40919
40920 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
40921 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
40922
40923 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
40924 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
40925
40926 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
40927 }
40928 }
40929 }
40930 }
40931 return SDValue();
40932}
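// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The source pattern the combine above targets, plus a scalar model of BZHI
// (zero all bits at position idx and above). With a table arr[j] = 2^j - 1,
// loading arr[idx] and ANDing it with another value is exactly BZHI of that
// value by idx. Helper names are invented for illustration only.
#include <cassert>
#include <cstdint>

static uint32_t bzhi_model(uint32_t x, unsigned idx) {
  return idx >= 32 ? x : x & ((1u << idx) - 1u); // BZHI copies x when idx >= 32
}

int main() {
  uint32_t arr[8];
  for (unsigned j = 0; j < 8; ++j)
    arr[j] = (1u << j) - 1u;                     // 0x0, 0x1, 0x3, 0x7, 0xF, ...
  uint32_t inp = 0xDEADBEEFu;
  for (unsigned j = 0; j < 8; ++j)
    assert((arr[j] & inp) == bzhi_model(inp, j));
  return 0;
}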
40933
40934// Look for (and (ctpop X), 1) which is the IR form of __builtin_parity.
40935 // Turn it into a series of XORs and a setnp.
40936static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
40937 const X86Subtarget &Subtarget) {
40938 EVT VT = N->getValueType(0);
40939
40940 // We only support 64-bit and 32-bit. 64-bit requires special handling
40941 // unless the 64-bit popcnt instruction is legal.
40942 if (VT != MVT::i32 && VT != MVT::i64)
40943 return SDValue();
40944
40945 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40946 if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT))
40947 return SDValue();
40948
40949 SDValue N0 = N->getOperand(0);
40950 SDValue N1 = N->getOperand(1);
40951
40952 // LHS needs to be a single use CTPOP.
40953 if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse())
40954 return SDValue();
40955
40956 // RHS needs to be 1.
40957 if (!isOneConstant(N1))
40958 return SDValue();
40959
40960 SDLoc DL(N);
40961 SDValue X = N0.getOperand(0);
40962
40963 // If this is 64-bit, it's always best to xor the two 32-bit pieces together
40964 // even if we have popcnt.
40965 if (VT == MVT::i64) {
40966 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
40967 DAG.getNode(ISD::SRL, DL, VT, X,
40968 DAG.getConstant(32, DL, MVT::i8)));
40969 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
40970 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
40971 // Generate a 32-bit parity idiom. This will bring us back here if we need
40972 // to expand it too.
40973 SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32,
40974 DAG.getNode(ISD::CTPOP, DL, MVT::i32, X),
40975 DAG.getConstant(1, DL, MVT::i32));
40976 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity);
40977 }
40978 assert(VT == MVT::i32 && "Unexpected VT!");
40979
40980 // Xor the high and low 16-bits together using a 32-bit operation.
40981 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X,
40982 DAG.getConstant(16, DL, MVT::i8));
40983 X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16);
40984
40985 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
40986 // This should allow an h-reg to be used to save a shift.
40987 // FIXME: We only get an h-reg in 32-bit mode.
40988 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
40989 DAG.getNode(ISD::SRL, DL, VT, X,
40990 DAG.getConstant(8, DL, MVT::i8)));
40991 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
40992 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
40993 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
40994
40995 // Copy the inverse of the parity flag into a register with setcc.
40996 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
40997 // Zero extend to original type.
40998 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
40999}
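// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The xor-folding scheme used above, checked against a bit-by-bit reference on
// scalars: xoring the halves together preserves parity, so after folding
// 32 -> 16 -> 8 bits only the parity of the low byte (the PF flag of the final
// 8-bit xor) is needed. Helper names are invented for illustration only.
#include <cassert>
#include <cstdint>

static unsigned parity_by_folding(uint32_t x) {
  x ^= x >> 16;                          // fold the two 16-bit halves
  x ^= x >> 8;                           // fold the remaining two bytes
  unsigned p = 0;
  for (unsigned b = x & 0xFFu; b; b >>= 1)
    p ^= b & 1u;                         // parity of the low byte
  return p;
}

static unsigned parity_reference(uint32_t x) {
  unsigned p = 0;
  for (int i = 0; i < 32; ++i)
    p ^= (x >> i) & 1u;
  return p;
}

int main() {
  for (uint32_t x : {0u, 1u, 3u, 0xFFu, 0xDEADBEEFu, 0x80000000u})
    assert(parity_by_folding(x) == parity_reference(x));
  return 0;
}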
41000
41001
41002// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
41003 // where C is a mask containing the same number of bits as the setcc and
41004 // where the setcc will freely zero the upper bits of the k-register. We can
41005 // replace the undef in the concat with 0s and remove the AND. This mainly
41006 // helps with v2i1/v4i1 setcc being cast to scalar.
41007static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
41008 const X86Subtarget &Subtarget) {
41009 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
41010
41011 EVT VT = N->getValueType(0);
41012
41013 // Make sure this is an AND with constant. We will check the value of the
41014 // constant later.
41015 if (!isa<ConstantSDNode>(N->getOperand(1)))
41016 return SDValue();
41017
41018 // This is implied by the ConstantSDNode.
41019 assert(!VT.isVector() && "Expected scalar VT!");
41020
41021 if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
41022 !N->getOperand(0).hasOneUse() ||
41023 !N->getOperand(0).getOperand(0).hasOneUse())
41024 return SDValue();
41025
41026 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41027 SDValue Src = N->getOperand(0).getOperand(0);
41028 EVT SrcVT = Src.getValueType();
41029 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
41030 !TLI.isTypeLegal(SrcVT))
41031 return SDValue();
41032
41033 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
41034 return SDValue();
41035
41036 // We only care about the first subvector of the concat; we expect the
41037 // other subvectors to be ignored due to the AND if we make the change.
41038 SDValue SubVec = Src.getOperand(0);
41039 EVT SubVecVT = SubVec.getValueType();
41040
41041 // First subvector should be a setcc with a legal result type. The RHS of the
41042 // AND should be a mask with this many bits.
41043 if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
41044 !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
41045 return SDValue();
41046
41047 EVT SetccVT = SubVec.getOperand(0).getValueType();
41048 if (!TLI.isTypeLegal(SetccVT) ||
41049 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
41050 return SDValue();
41051
41052 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
41053 return SDValue();
41054
41055 // We passed all the checks. Rebuild the concat_vectors with zeroes
41056 // and cast it back to VT.
41057 SDLoc dl(N);
41058 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
41059 DAG.getConstant(0, dl, SubVecVT));
41060 Ops[0] = SubVec;
41061 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
41062 Ops);
41063 return DAG.getBitcast(VT, Concat);
41064}
41065
41066static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
41067 TargetLowering::DAGCombinerInfo &DCI,
41068 const X86Subtarget &Subtarget) {
41069 EVT VT = N->getValueType(0);
41070
41071 // If this is SSE1 only convert to FAND to avoid scalarization.
41072 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
41073 return DAG.getBitcast(
41074 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
41075 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
41076 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
41077 }
41078
41079 // Use a 32-bit and+zext if upper bits known zero.
41080 if (VT == MVT::i64 && Subtarget.is64Bit() &&
41081 !isa<ConstantSDNode>(N->getOperand(1))) {
41082 APInt HiMask = APInt::getHighBitsSet(64, 32);
41083 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
41084 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
41085 SDLoc dl(N);
41086 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
41087 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
41088 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
41089 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
41090 }
41091 }
41092
41093 // This must be done before legalization has expanded the ctpop.
41094 if (SDValue V = combineParity(N, DAG, Subtarget))
41095 return V;
41096
41097 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
41098 // TODO: Support multiple SrcOps.
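// Illustrative example (an added sketch, not from the original source): for a
// v4i1 all-of reduction such as and(and(x[0], x[1]), and(x[2], x[3])), the
// source vector is converted to an i4 mask (the bitcast/movmsk form mentioned
// above) and the whole reduction becomes a single setcc(mask, 0b1111, eq).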
41099 if (VT == MVT::i1) {
41100 SmallVector<SDValue, 2> SrcOps;
41101 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
41102 SrcOps.size() == 1) {
41103 SDLoc dl(N);
41104 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41105 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
41106 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41107 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
41108 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
41109 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
41110 if (Mask) {
41111 APInt AllBits = APInt::getAllOnesValue(NumElts);
41112 return DAG.getSetCC(dl, MVT::i1, Mask,
41113 DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
41114 }
41115 }
41116 }
41117
41118 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
41119 return V;
41120
41121 if (DCI.isBeforeLegalizeOps())
41122 return SDValue();
41123
41124 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
41125 return R;
41126
41127 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
41128 return FPLogic;
41129
41130 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
41131 return R;
41132
41133 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
41134 return ShiftRight;
41135
41136 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
41137 return R;
41138
41139 // Attempt to recursively combine a bitmask AND with shuffles.
41140 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
41141 SDValue Op(N, 0);
41142 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41143 return Res;
41144 }
41145
41146 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
41147 if ((VT.getScalarSizeInBits() % 8) == 0 &&
41148 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41149 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
41150 SDValue BitMask = N->getOperand(1);
41151 SDValue SrcVec = N->getOperand(0).getOperand(0);
41152 EVT SrcVecVT = SrcVec.getValueType();
41153
41154 // Check that the constant bitmask masks whole bytes.
41155 APInt UndefElts;
41156 SmallVector<APInt, 64> EltBits;
41157 if (VT == SrcVecVT.getScalarType() &&
41158 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
41159 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
41160 llvm::all_of(EltBits, [](APInt M) {
41161 return M.isNullValue() || M.isAllOnesValue();
41162 })) {
41163 unsigned NumElts = SrcVecVT.getVectorNumElements();
41164 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
41165 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
41166
41167 // Create a root shuffle mask from the byte mask and the extracted index.
41168 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
41169 for (unsigned i = 0; i != Scale; ++i) {
41170 if (UndefElts[i])
41171 continue;
41172 int VecIdx = Scale * Idx + i;
41173 ShuffleMask[VecIdx] =
41174 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
41175 }
41176
41177 if (SDValue Shuffle = combineX86ShufflesRecursively(
41178 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
41179 /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
41180 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
41181 N->getOperand(0).getOperand(1));
41182 }
41183 }
41184
41185 return SDValue();
41186}
41187
41188// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
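// Illustrative example (an added sketch, not from the original source): with
// byte-sized lanes and C = 0x00FF00FF...,
//   or(and(X, 0x00FF...), and(Y, 0xFF00...))
// is rewritten as
//   or(and(X, 0x00FF...), andnp(0x00FF..., Y))
// because ANDNP(C, Y) computes (~C) & Y; keeping the same constant C in both
// operands exposes the bit-select to the PCMOV/VPTERNLOG lowering mentioned
// inside the function below.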
41189static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
41190 const X86Subtarget &Subtarget) {
41191 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
41192
41193 MVT VT = N->getSimpleValueType(0);
41194 if (!VT.isVector() || (VT.getScalarSizeInBits() % 8) != 0)
41195 return SDValue();
41196
41197 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
41198 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
41199 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
41200 return SDValue();
41201
41202 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
41203 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
41204 bool UseVPTERNLOG = (Subtarget.hasAVX512() && VT.is512BitVector()) ||
41205 Subtarget.hasVLX();
41206 if (!(Subtarget.hasXOP() || UseVPTERNLOG ||
41207 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
41208 return SDValue();
41209
41210 // Attempt to extract constant byte masks.
41211 APInt UndefElts0, UndefElts1;
41212 SmallVector<APInt, 32> EltBits0, EltBits1;
41213 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
41214 false, false))
41215 return SDValue();
41216 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
41217 false, false))
41218 return SDValue();
41219
41220 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
41221 // TODO - add UNDEF elts support.
41222 if (UndefElts0[i] || UndefElts1[i])
41223 return SDValue();
41224 if (EltBits0[i] != ~EltBits1[i])
41225 return SDValue();
41226 }
41227
41228 SDLoc DL(N);
41229 SDValue X = N->getOperand(0);
41230 SDValue Y =
41231 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
41232 DAG.getBitcast(VT, N1.getOperand(0)));
41233 return DAG.getNode(ISD::OR, DL, VT, X, Y);
41234}
41235
41236// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
41237static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
41238 if (N->getOpcode() != ISD::OR)
41239 return false;
41240
41241 SDValue N0 = N->getOperand(0);
41242 SDValue N1 = N->getOperand(1);
41243
41244 // Canonicalize AND to LHS.
41245 if (N1.getOpcode() == ISD::AND)
41246 std::swap(N0, N1);
41247
41248 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
41249 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
41250 return false;
41251
41252 Mask = N1.getOperand(0);
41253 X = N1.getOperand(1);
41254
41255 // Check to see if the mask appeared in both the AND and ANDNP.
41256 if (N0.getOperand(0) == Mask)
41257 Y = N0.getOperand(1);
41258 else if (N0.getOperand(1) == Mask)
41259 Y = N0.getOperand(0);
41260 else
41261 return false;
41262
41263 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
41264 // ANDNP combine allows other combines to happen that prevent matching.
41265 return true;
41266}
41267
41268// Try to fold:
41269// (or (and (m, y), (pandn m, x)))
41270// into:
41271// (vselect m, x, y)
41272// As a special case, try to fold:
41273// (or (and (m, (sub 0, x)), (pandn m, x)))
41274// into:
41275// (sub (xor X, M), M)
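// A worked check of the special case (an added sketch, not from the original
// source), assuming each lane of M is all-zeros or all-ones:
//   M ==  0: (x ^  0) -  0  == x              (the pandn side is selected)
//   M == -1: (x ^ -1) - -1  == ~x + 1 == -x   (the "sub 0, x" side)
// so (sub (xor X, M), M) is a per-lane conditional negate.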
41276static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
41277 const X86Subtarget &Subtarget) {
41278 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
41279
41280 EVT VT = N->getValueType(0);
41281 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
41282 (VT.is256BitVector() && Subtarget.hasInt256())))
41283 return SDValue();
41284
41285 SDValue X, Y, Mask;
41286 if (!matchLogicBlend(N, X, Y, Mask))
41287 return SDValue();
41288
41289 // Validate that X, Y, and Mask are bitcasts, and see through them.
41290 Mask = peekThroughBitcasts(Mask);
41291 X = peekThroughBitcasts(X);
41292 Y = peekThroughBitcasts(Y);
41293
41294 EVT MaskVT = Mask.getValueType();
41295 unsigned EltBits = MaskVT.getScalarSizeInBits();
41296
41297 // TODO: Attempt to handle floating point cases as well?
41298 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
41299 return SDValue();
41300
41301 SDLoc DL(N);
41302
41303 // Attempt to combine to conditional negate: (sub (xor X, M), M)
41304 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
41305 DAG, Subtarget))
41306 return Res;
41307
41308 // PBLENDVB is only available on SSE 4.1.
41309 if (!Subtarget.hasSSE41())
41310 return SDValue();
41311
41312 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
41313
41314 X = DAG.getBitcast(BlendVT, X);
41315 Y = DAG.getBitcast(BlendVT, Y);
41316 Mask = DAG.getBitcast(BlendVT, Mask);
41317 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
41318 return DAG.getBitcast(VT, Mask);
41319}
41320
41321// Helper function for combineOrCmpEqZeroToCtlzSrl
41322// Transforms:
41323// seteq(cmp x, 0)
41324// into:
41325// srl(ctlz x), log2(bitsize(x))
41326// Input pattern is checked by caller.
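// A worked example (an added sketch, not from the original source), for a
// 32-bit x with LZCNT semantics (ctlz(0) == 32):
//   x == 0  ->  ctlz(x) == 32  ->  32 >> 5 == 1
//   x != 0  ->  ctlz(x) <= 31  ->  ctlz(x) >> 5 == 0
// so srl(ctlz x, log2(32)) reproduces the seteq(cmp x, 0) result.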
41327static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
41328 SelectionDAG &DAG) {
41329 SDValue Cmp = Op.getOperand(1);
41330 EVT VT = Cmp.getOperand(0).getValueType();
41331 unsigned Log2b = Log2_32(VT.getSizeInBits());
41332 SDLoc dl(Op);
41333 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
41334 // The result of the shift is true or false, and on X86, the 32-bit
41335 // encoding of shr and lzcnt is more desirable.
41336 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
41337 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
41338 DAG.getConstant(Log2b, dl, MVT::i8));
41339 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
41340}
41341
41342// Try to transform:
41343// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
41344// into:
41345 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
41346 // Will also attempt to match more generic cases, e.g.:
41347// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
41348// Only applies if the target supports the FastLZCNT feature.
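// Illustrative example (an added sketch, not from the original source), for
// two 32-bit values x and y:
//   zext(or(seteq(cmp x, 0), seteq(cmp y, 0)))
// is lowered to
//   zext(or(srl(ctlz(x), 5), srl(ctlz(y), 5)))
// which the generic DAG combiner can then refold into
//   zext(srl(or(ctlz(x), ctlz(y)), 5)).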
41349static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
41350 TargetLowering::DAGCombinerInfo &DCI,
41351 const X86Subtarget &Subtarget) {
41352 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
41353 return SDValue();
41354
41355 auto isORCandidate = [](SDValue N) {
41356 return (N->getOpcode() == ISD::OR && N->hasOneUse());
41357 };
41358
41359 // Check that the zero extend is extending to 32 bits or more. The code
41360 // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
41361 // require extra instructions to clear the upper bits.
41362 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
41363 !isORCandidate(N->getOperand(0)))
41364 return SDValue();
41365
41366 // Check the node matches: setcc(eq, cmp 0)
41367 auto isSetCCCandidate = [](SDValue N) {
41368 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
41369 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
41370 N->getOperand(1).getOpcode() == X86ISD::CMP &&
41371 isNullConstant(N->getOperand(1).getOperand(1)) &&
41372 N->getOperand(1).getValueType().bitsGE(MVT::i32);
41373 };
41374
41375 SDNode *OR = N->getOperand(0).getNode();
41376 SDValue LHS = OR->getOperand(0);
41377 SDValue RHS = OR->getOperand(1);
41378
41379 // Save nodes matching or(or, setcc(eq, cmp 0)).
41380 SmallVector<SDNode *, 2> ORNodes;
41381 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
41382 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
41383 ORNodes.push_back(OR);
41384 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
41385 LHS = OR->getOperand(0);
41386 RHS = OR->getOperand(1);
41387 }
41388
41389 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
41390 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
41391 !isORCandidate(SDValue(OR, 0)))
41392 return SDValue();
41393
41394 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
41395 // to
41396 // or(srl(ctlz),srl(ctlz)).
41397 // The dag combiner can then fold it into:
41398 // srl(or(ctlz, ctlz)).
41399 EVT VT = OR->getValueType(0);
41400 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
41401 SDValue Ret, NewRHS;
41402 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
41403 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
41404
41405 if (!Ret)
41406 return SDValue();
41407
41408 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
41409 while (ORNodes.size() > 0) {
41410 OR = ORNodes.pop_back_val();
41411 LHS = OR->getOperand(0);
41412 RHS = OR->getOperand(1);
41413 // Swap rhs with lhs to match or(setcc(eq, cmp 0), or).
41414 if (RHS->getOpcode() == ISD::OR)
41415 std::swap(LHS, RHS);
41416 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
41417 if (!NewRHS)
41418 return SDValue();
41419 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
41420 }
41421
41422 if (Ret)
41423 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
41424
41425 return Ret;
41426}
41427
41428static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
41429 const X86Subtarget &Subtarget) {
41430 assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
41431 SDValue N0 = N->getOperand(0);
41432 SDValue N1 = N->getOperand(1);
41433 EVT VT = N->getValueType(0);
41434 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41435
41436 if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
41437 !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
41438 return SDValue();
41439
41440 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
41441 bool OptForSize = DAG.shouldOptForSize();
41442 unsigned Bits = VT.getScalarSizeInBits();
41443
41444 // SHLD/SHRD instructions have lower register pressure, but on some
41445 // platforms they have higher latency than the equivalent
41446 // series of shifts/or that would otherwise be generated.
41447 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
41448 // have higher latencies and we are not optimizing for size.
41449 if (!OptForSize && Subtarget.isSHLDSlow())
41450 return SDValue();
41451
41452 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
41453 std::swap(N0, N1);
41454 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
41455 return SDValue();
41456 if (!N0.hasOneUse() || !N1.hasOneUse())
41457 return SDValue();
41458
41459 EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
41460
41461 SDValue ShAmt0 = N0.getOperand(1);
41462 if (ShAmt0.getValueType() != ShiftVT)
41463 return SDValue();
41464 SDValue ShAmt1 = N1.getOperand(1);
41465 if (ShAmt1.getValueType() != ShiftVT)
41466 return SDValue();
41467
41468 // Peek through any modulo shift masks.
41469 SDValue ShMsk0;
41470 if (ShAmt0.getOpcode() == ISD::AND &&
41471 isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
41472 ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
41473 ShMsk0 = ShAmt0;
41474 ShAmt0 = ShAmt0.getOperand(0);
41475 }
41476 SDValue ShMsk1;
41477 if (ShAmt1.getOpcode() == ISD::AND &&
41478 isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
41479 ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
41480 ShMsk1 = ShAmt1;
41481 ShAmt1 = ShAmt1.getOperand(0);
41482 }
41483
41484 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
41485 ShAmt0 = ShAmt0.getOperand(0);
41486 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
41487 ShAmt1 = ShAmt1.getOperand(0);
41488
41489 SDLoc DL(N);
41490 unsigned Opc = ISD::FSHL;
41491 SDValue Op0 = N0.getOperand(0);
41492 SDValue Op1 = N1.getOperand(0);
41493 if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
41494 Opc = ISD::FSHR;
41495 std::swap(Op0, Op1);
41496 std::swap(ShAmt0, ShAmt1);
41497 std::swap(ShMsk0, ShMsk1);
41498 }
41499
41500 auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
41501 SDValue Amt) {
41502 if (Opc == ISD::FSHR)
41503 std::swap(Op0, Op1);
41504 return DAG.getNode(Opc, DL, VT, Op0, Op1,
41505 DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
41506 };
41507
41508 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
41509 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
41510 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
41511 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
41512 // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
41513 // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
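  // Illustrative example (an added sketch, not from the original source), for
  // 32-bit operands with C = 8:
  //   or(shl(X, 8), srl(Y, 24)) == fshl(X, Y, 8)
  // the high 24 bits of the result come from X and the low 8 bits come from
  // the top of Y, i.e. the upper half of the 64-bit concatenation X:Y shifted
  // left by 8.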
41514 if (ShAmt1.getOpcode() == ISD::SUB) {
41515 SDValue Sum = ShAmt1.getOperand(0);
41516 if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
41517 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
41518 if (ShAmt1Op1.getOpcode() == ISD::AND &&
41519 isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
41520 ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
41521 ShMsk1 = ShAmt1Op1;
41522 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
41523 }
41524 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
41525 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
41526 if ((SumC->getAPIntValue() == Bits ||
41527 (SumC->getAPIntValue() == 0 && ShMsk1)) &&
41528 ShAmt1Op1 == ShAmt0)
41529 return GetFunnelShift(Op0, Op1, ShAmt0);
41530 }
41531 } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
41532 auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
41533 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
41534 return GetFunnelShift(Op0, Op1, ShAmt0);
41535 } else if (ShAmt1.getOpcode() == ISD::XOR) {
41536 SDValue Mask = ShAmt1.getOperand(1);
41537 if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
41538 unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
41539 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
41540 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
41541 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
41542 if (MaskC->getSExtValue() == (Bits - 1) &&
41543 (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
41544 if (Op1.getOpcode() == InnerShift &&
41545 isa<ConstantSDNode>(Op1.getOperand(1)) &&
41546 Op1.getConstantOperandAPInt(1).isOneValue()) {
41547 return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
41548 }
41549 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
41550 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
41551 Op1.getOperand(0) == Op1.getOperand(1)) {
41552 return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
41553 }
41554 }
41555 }
41556 }
41557
41558 return SDValue();
41559}
41560
41561static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
41562 TargetLowering::DAGCombinerInfo &DCI,
41563 const X86Subtarget &Subtarget) {
41564 SDValue N0 = N->getOperand(0);
41565 SDValue N1 = N->getOperand(1);
41566 EVT VT = N->getValueType(0);
41567
41568 // If this is SSE1-only, convert to FOR to avoid scalarization.
41569 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
41570 return DAG.getBitcast(MVT::v4i32,
41571 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
41572 DAG.getBitcast(MVT::v4f32, N0),
41573 DAG.getBitcast(MVT::v4f32, N1)));
41574 }
41575
41576 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
41577 // TODO: Support multiple SrcOps.
41578 if (VT == MVT::i1) {
41579 SmallVector<SDValue, 2> SrcOps;
41580 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
41581 SrcOps.size() == 1) {
41582 SDLoc dl(N);
41583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41584 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
41585 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
41586 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
41587 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
41588 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
41589 if (Mask) {
41590 APInt AllBits = APInt::getNullValue(NumElts);
41591 return DAG.getSetCC(dl, MVT::i1, Mask,
41592 DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
41593 }
41594 }
41595 }
41596
41597 if (DCI.isBeforeLegalizeOps())
41598 return SDValue();
41599
41600 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
41601 return R;
41602
41603 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
41604 return FPLogic;
41605
41606 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
41607 return R;
41608
41609 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
41610 return R;
41611
41612 if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
41613 return R;
41614
41615 // Attempt to recursively combine an OR of shuffles.
41616 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
41617 SDValue Op(N, 0);
41618 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41619 return Res;
41620 }
41621
41622 return SDValue();
41623}
41624
41625/// Try to turn tests against the signbit in the form of:
41626/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
41627/// into:
41628/// SETGT(X, -1)
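/// A worked example (an added sketch, not from the original source), for i32 X:
///   srl(X, 31) is the sign bit: 1 if X < 0, 0 otherwise.
///   xor(trunc(srl(X, 31)), 1) is therefore 1 iff X >= 0, i.e. X > -1,
///   which is exactly setgt(X, -1).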
41629static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
41630 // This is only worth doing if the output type is i8 or i1.
41631 EVT ResultType = N->getValueType(0);
41632 if (ResultType != MVT::i8 && ResultType != MVT::i1)
41633 return SDValue();
41634
41635 SDValue N0 = N->getOperand(0);
41636 SDValue N1 = N->getOperand(1);
41637
41638 // We should be performing an xor against a truncated shift.
41639 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
41640 return SDValue();
41641
41642 // Make sure we are performing an xor against one.
41643 if (!isOneConstant(N1))
41644 return SDValue();
41645
41646 // SetCC on x86 zero extends so only act on this if it's a logical shift.
41647 SDValue Shift = N0.getOperand(0);
41648 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
41649 return SDValue();
41650
41651 // Make sure we are truncating from one of i16, i32 or i64.
41652 EVT ShiftTy = Shift.getValueType();
41653 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
41654 return SDValue();
41655
41656 // Make sure the shift amount extracts the sign bit.
41657 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
41658 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
41659 return SDValue();
41660
41661 // Create a greater-than comparison against -1.
41662 // N.B. Using SETGE against 0 works but we want a canonical looking
41663 // comparison; using SETGT matches up with what TranslateX86CC does.
41664 SDLoc DL(N);
41665 SDValue ShiftOp = Shift.getOperand(0);
41666 EVT ShiftOpTy = ShiftOp.getValueType();
41667 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41668 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
41669 *DAG.getContext(), ResultType);
41670 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
41671 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
41672 if (SetCCResultType != ResultType)
41673 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
41674 return Cond;
41675}
41676
41677/// Turn vector tests of the signbit in the form of:
41678/// xor (sra X, elt_size(X)-1), -1
41679/// into:
41680/// pcmpgt X, -1
41681///
41682/// This should be called before type legalization because the pattern may not
41683/// persist after that.
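/// A worked example (an added sketch, not from the original source), per i32
/// lane:
///   sra(X, 31) smears the sign bit: -1 for negative lanes, 0 otherwise.
///   xor(sra(X, 31), -1) is then -1 exactly on the non-negative lanes,
///   which matches pcmpgt(X, -1): all-ones where X > -1, zero elsewhere.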
41684static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
41685 const X86Subtarget &Subtarget) {
41686 EVT VT = N->getValueType(0);
41687 if (!VT.isSimple())
41688 return SDValue();
41689
41690 switch (VT.getSimpleVT().SimpleTy) {
41691 default: return SDValue();
41692 case MVT::v16i8:
41693 case MVT::v8i16:
41694 case MVT::v4i32:
41695 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
41696 case MVT::v32i8:
41697 case MVT::v16i16:
41698 case MVT::v8i32:
41699 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
41700 }
41701
41702 // There must be a shift right algebraic before the xor, and the xor must be a
41703 // 'not' operation.
41704 SDValue Shift = N->getOperand(0);
41705 SDValue Ones = N->getOperand(1);
41706 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
41707 !ISD::isBuildVectorAllOnes(Ones.getNode()))
41708 return SDValue();
41709
41710 // The shift should be smearing the sign bit across each vector element.
41711 auto *ShiftAmt =
41712 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
41713 if (!ShiftAmt ||
41714 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
41715 return SDValue();
41716
41717 // Create a greater-than comparison against -1. We don't use the more obvious
41718 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
41719 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
41720}
41721
41722/// Detect patterns of truncation with unsigned saturation:
41723///
41724/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
41725/// Return the source value x to be truncated or SDValue() if the pattern was
41726/// not matched.
41727///
41728/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
41729/// where C1 >= 0 and C2 is unsigned max of destination type.
41730///
41731/// (truncate (smax (smin (x, C2), C1)) to dest_type)
41732/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
41733///
41734/// These two patterns are equivalent to:
41735/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
41736/// So return the smax(x, C1) value to be truncated or SDValue() if the
41737/// pattern was not matched.
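/// A worked example (an added sketch, not from the original source), for an
/// i32 -> i8 truncate: pattern 1 is trunc(umin(x, 255)); pattern 2 is
/// trunc(smin(smax(x, 0), 255)), which clamps to the same range because the
/// smax first forces the value to be non-negative.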
41738static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
41739 const SDLoc &DL) {
41740 EVT InVT = In.getValueType();
41741
41742 // Saturation with truncation. We truncate from InVT to VT.
41743 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
41744        "Unexpected types for truncate operation");
41745
41746 // Match min/max and return limit value as a parameter.
41747 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
41748 if (V.getOpcode() == Opcode &&
41749 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
41750 return V.getOperand(0);
41751 return SDValue();
41752 };
41753
41754 APInt C1, C2;
41755 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
41756 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
41757 // to the element size of the destination type.
41758 if (C2.isMask(VT.getScalarSizeInBits()))
41759 return UMin;
41760
41761 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
41762 if (MatchMinMax(SMin, ISD::SMAX, C1))
41763 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
41764 return SMin;
41765
41766 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
41767 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
41768 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
41769 C2.uge(C1)) {
41770 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
41771 }
41772
41773 return SDValue();
41774}
41775
41776/// Detect patterns of truncation with signed saturation:
41777/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
41778/// signed_max_of_dest_type)) to dest_type)
41779/// or:
41780/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
41781/// signed_min_of_dest_type)) to dest_type).
41782/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
41783/// Return the source value to be truncated or SDValue() if the pattern was not
41784/// matched.
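/// A worked example (an added sketch, not from the original source), for an
/// i16 -> i8 truncate: the signed-saturation form is
///   trunc(smin(smax(x, -128), 127))
/// and with MatchPackUS the clamp range becomes [0, 255] instead, which is
/// what the PACKUS-style lowering expects.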
41785static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
41786 unsigned NumDstBits = VT.getScalarSizeInBits();
41787 unsigned NumSrcBits = In.getScalarValueSizeInBits();
41788 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
41789
41790 auto MatchMinMax = [](SDValue V, unsigned Opcode,
41791 const APInt &Limit) -> SDValue {
41792 APInt C;
41793 if (V.getOpcode() == Opcode &&
41794 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
41795 return V.getOperand(0);
41796 return SDValue();
41797 };
41798
41799 APInt SignedMax, SignedMin;
41800 if (MatchPackUS) {
41801 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
41802 SignedMin = APInt(NumSrcBits, 0);
41803 } else {
41804 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
41805 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
41806 }
41807
41808 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
41809 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
41810 return SMax;
41811
41812 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
41813 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
41814 return SMin;
41815
41816 return SDValue();
41817}
41818
41819static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
41820 SelectionDAG &DAG,
41821 const X86Subtarget &Subtarget) {
41822 if (!Subtarget.hasSSE2() || !VT.isVector())
41823 return SDValue();
41824
41825 EVT SVT = VT.getVectorElementType();
41826 EVT InVT = In.getValueType();
41827 EVT InSVT = InVT.getVectorElementType();
41828
41829 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
41830 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
41831 // and concatenate at the same time. Then we can use a final vpmovuswb to
41832 // clip to 0-255.
41833 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
41834 InVT == MVT::v16i32 && VT == MVT::v16i8) {
41835 if (auto USatVal = detectSSatPattern(In, VT, true)) {
41836 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
41837 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
41838 DL, DAG, Subtarget);
41839 assert(Mid && "Failed to pack!");
41840 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
41841 }
41842 }
41843
41844 // vXi32 truncate instructions are available with AVX512F.
41845 // vXi16 truncate instructions are only available with AVX512BW.
41846 // For 256-bit or smaller vectors, we require VLX.
41847 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
41848 // If the result type is 256 bits or larger and we have disabled 512-bit
41849 // registers, we should go ahead and use the pack instructions if possible.
41850 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
41851 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
41852 (InVT.getSizeInBits() > 128) &&
41853 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
41854 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
41855
41856 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
41857 VT.getSizeInBits() >= 64 &&
41858 (SVT == MVT::i8 || SVT == MVT::i16) &&
41859 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
41860 if (auto USatVal = detectSSatPattern(In, VT, true)) {
41861 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
41862 // Only do this when the result is at least 64 bits or we'll leave
41863 // dangling PACKSSDW nodes.
41864 if (SVT == MVT::i8 && InSVT == MVT::i32) {
41865 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
41866 VT.getVectorNumElements());
41867 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
41868 DAG, Subtarget);
41869 assert(Mid && "Failed to pack!");
41870 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
41871 Subtarget);
41872 assert(V && "Failed to pack!");
41873 return V;
41874 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
41875 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
41876 Subtarget);
41877 }
41878 if (auto SSatVal = detectSSatPattern(In, VT))
41879 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
41880 Subtarget);
41881 }
41882
41883 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41884 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
41885 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
41886 unsigned TruncOpc = 0;
41887 SDValue SatVal;
41888 if (auto SSatVal = detectSSatPattern(In, VT)) {
41889 SatVal = SSatVal;
41890 TruncOpc = X86ISD::VTRUNCS;
41891 } else if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) {
41892 SatVal = USatVal;
41893 TruncOpc = X86ISD::VTRUNCUS;
41894 }
41895 if (SatVal) {
41896 unsigned ResElts = VT.getVectorNumElements();
41897 // If the input type is less than 512 bits and we don't have VLX, we need
41898 // to widen to 512 bits.
41899 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
41900 unsigned NumConcats = 512 / InVT.getSizeInBits();
41901 ResElts *= NumConcats;
41902 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
41903 ConcatOps[0] = SatVal;
41904 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
41905 NumConcats * InVT.getVectorNumElements());
41906 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
41907 }
41908 // Widen the result if it's narrower than 128 bits.
41909 if (ResElts * SVT.getSizeInBits() < 128)
41910 ResElts = 128 / SVT.getSizeInBits();
41911 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
41912 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
41913 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
41914 DAG.getIntPtrConstant(0, DL));
41915 }
41916 }
41917
41918 return SDValue();
41919}
41920
41921/// This function detects the AVG pattern between vectors of unsigned i8/i16,
41922 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
41923/// X86ISD::AVG instruction.
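/// A worked numeric check (an added sketch, not from the original source), for
/// unsigned i8 operands widened to i32: with a = 200 and b = 255,
/// (200 + 255 + 1) / 2 == 228, i.e. the average rounded up, and the sum cannot
/// overflow because the addition happens in the wider type. This matches the
/// rounding-average semantics of the PAVGB/PAVGW instructions that
/// X86ISD::AVG is selected to.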
41924static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
41925 const X86Subtarget &Subtarget,
41926 const SDLoc &DL) {
41927 if (!VT.isVector())
41928 return SDValue();
41929 EVT InVT = In.getValueType();
41930 unsigned NumElems = VT.getVectorNumElements();
41931
41932 EVT ScalarVT = VT.getVectorElementType();
41933 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
41934 NumElems >= 2 && isPowerOf2_32(NumElems)))
41935 return SDValue();
41936
41937 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
41938 // than the original input type (i8/i16).
41939 EVT InScalarVT = InVT.getVectorElementType();
41940 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
41941 return SDValue();
41942
41943 if (!Subtarget.hasSSE2())
41944 return SDValue();
41945
41946 // Detect the following pattern:
41947 //
41948 // %1 = zext <N x i8> %a to <N x i32>
41949 // %2 = zext <N x i8> %b to <N x i32>
41950 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
41951 // %4 = add nuw nsw <N x i32> %3, %2
41952 // %5 = lshr <N x i32> %4, <i32 1 x N>
41953 // %6 = trunc <N x i32> %5 to <N x i8>
41954 //
41955 // In AVX512, the last instruction can also be a trunc store.
41956 if (In.getOpcode() != ISD::SRL)
41957 return SDValue();
41958
41959 // A lambda checking that the given SDValue is a constant vector and that each
41960 // element is in the range [Min, Max].
41961 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
41962 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
41963 if (!BV || !BV->isConstant())
41964 return false;
41965 for (SDValue Op : V->ops()) {
41966 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
41967 if (!C)
41968 return false;
41969 const APInt &Val = C->getAPIntValue();
41970 if (Val.ult(Min) || Val.ugt(Max))
41971 return false;
41972 }
41973 return true;
41974 };
41975
41976 // Check if each element of the vector is right-shifted by one.
41977 auto LHS = In.getOperand(0);
41978 auto RHS = In.getOperand(1);
41979 if (!IsConstVectorInRange(RHS, 1, 1))
41980 return SDValue();
41981 if (LHS.getOpcode() != ISD::ADD)
41982 return SDValue();
41983
41984 // Detect a pattern of a + b + 1 where the order doesn't matter.
41985 SDValue Operands[3];
41986 Operands[0] = LHS.getOperand(0);
41987 Operands[1] = LHS.getOperand(1);
41988
41989 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
41990 ArrayRef<SDValue> Ops) {
41991 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
41992 };
41993
41994 // Take care of the case when one of the operands is a constant vector whose
41995 // element is in the range [1, 256] for i8 (or [1, 65536] for i16).
41996 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
41997 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
41998 Operands[0].getOperand(0).getValueType() == VT) {
41999 // The pattern is detected. Subtract one from the constant vector, then
42000 // demote it and emit X86ISD::AVG instruction.
42001 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
42002 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
42003 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
42004 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
42005 { Operands[0].getOperand(0), Operands[1] },
42006 AVGBuilder);
42007 }
42008
42009 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
42010 // Match the or case only if its 'add-like' - can be replaced by an add.
42011 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
42012 if (ISD::ADD == V.getOpcode()) {
42013 Op0 = V.getOperand(0);
42014 Op1 = V.getOperand(1);
42015 return true;
42016 }
42017 if (ISD::ZERO_EXTEND != V.getOpcode())
42018 return false;
42019 V = V.getOperand(0);
42020 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
42021 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
42022 return false;
42023 Op0 = V.getOperand(0);
42024 Op1 = V.getOperand(1);
42025 return true;
42026 };
42027
42028 SDValue Op0, Op1;
42029 if (FindAddLike(Operands[0], Op0, Op1))
42030 std::swap(Operands[0], Operands[1]);
42031 else if (!FindAddLike(Operands[1], Op0, Op1))
42032 return SDValue();
42033 Operands[2] = Op0;
42034 Operands[1] = Op1;
42035
42036 // Now we have three operands of two additions. Check that one of them is a
42037 // constant vector with ones, and the other two can be promoted from i8/i16.
42038 for (int i = 0; i < 3; ++i) {
42039 if (!IsConstVectorInRange(Operands[i], 1, 1))
42040 continue;
42041 std::swap(Operands[i], Operands[2]);
42042
42043 // Check if Operands[0] and Operands[1] are results of type promotion.
42044 for (int j = 0; j < 2; ++j)
42045 if (Operands[j].getValueType() != VT) {
42046 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
42047 Operands[j].getOperand(0).getValueType() != VT)
42048 return SDValue();
42049 Operands[j] = Operands[j].getOperand(0);
42050 }
42051
42052 // The pattern is detected, emit X86ISD::AVG instruction(s).
42053 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Operands[0], Operands[1]},
42054 AVGBuilder);
42055 }
42056
42057 return SDValue();
42058}
42059
42060static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
42061 TargetLowering::DAGCombinerInfo &DCI,
42062 const X86Subtarget &Subtarget) {
42063 LoadSDNode *Ld = cast<LoadSDNode>(N);
42064 EVT RegVT = Ld->getValueType(0);
42065 EVT MemVT = Ld->getMemoryVT();
42066 SDLoc dl(Ld);
42067 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42068
42069 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
42070 // into two 16-byte operations. Also split non-temporal aligned loads on
42071 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
42072 ISD::LoadExtType Ext = Ld->getExtensionType();
42073 bool Fast;
42074 unsigned Alignment = Ld->getAlignment();
42075 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
42076 Ext == ISD::NON_EXTLOAD &&
42077 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
42078 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
42079 *Ld->getMemOperand(), &Fast) &&
42080 !Fast))) {
42081 unsigned NumElems = RegVT.getVectorNumElements();
42082 if (NumElems < 2)
42083 return SDValue();
42084
42085 unsigned HalfAlign = 16;
42086 SDValue Ptr1 = Ld->getBasePtr();
42087 SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
42088 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
42089 NumElems / 2);
42090 SDValue Load1 =
42091 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
42092 Alignment, Ld->getMemOperand()->getFlags());
42093 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
42094 Ld->getPointerInfo().getWithOffset(HalfAlign),
42095 MinAlign(Alignment, HalfAlign),
42096 Ld->getMemOperand()->getFlags());
42097 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
42098 Load1.getValue(1), Load2.getValue(1));
42099
42100 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
42101 return DCI.CombineTo(N, NewVec, TF, true);
42102 }
42103
42104 // Bool vector load - attempt to cast to an integer, as we have good
42105 // (vXiY *ext(vXi1 bitcast(iX))) handling.
42106 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
42107 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
42108 unsigned NumElts = RegVT.getVectorNumElements();
42109 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
42110 if (TLI.isTypeLegal(IntVT)) {
42111 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
42112 Ld->getPointerInfo(), Alignment,
42113 Ld->getMemOperand()->getFlags());
42114 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
42115 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
42116 }
42117 }
42118
42119 return SDValue();
42120}
42121
42122/// If V is a build vector of boolean constants and exactly one of those
42123/// constants is true, return the operand index of that true element.
42124/// Otherwise, return -1.
42125static int getOneTrueElt(SDValue V) {
42126 // This needs to be a build vector of booleans.
42127 // TODO: Checking for the i1 type matches the IR definition for the mask,
42128 // but the mask check could be loosened to i8 or other types. That might
42129 // also require checking more than 'allOnesValue'; e.g., the x86 HW
42130 // instructions only require that the MSB is set for each mask element.
42131 // The ISD::MSTORE comments/definition do not specify how the mask operand
42132 // is formatted.
42133 auto *BV = dyn_cast<BuildVectorSDNode>(V);
42134 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
42135 return -1;
42136
42137 int TrueIndex = -1;
42138 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
42139 for (unsigned i = 0; i < NumElts; ++i) {
42140 const SDValue &Op = BV->getOperand(i);
42141 if (Op.isUndef())
42142 continue;
42143 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
42144 if (!ConstNode)
42145 return -1;
42146 if (ConstNode->getAPIntValue().isAllOnesValue()) {
42147 // If we already found a one, this is too many.
42148 if (TrueIndex >= 0)
42149 return -1;
42150 TrueIndex = i;
42151 }
42152 }
42153 return TrueIndex;
42154}
42155
42156/// Given a masked memory load/store operation, return true if it has one mask
42157/// bit set. If it has one mask bit set, then also return the memory address of
42158/// the scalar element to load/store, the vector index to insert/extract that
42159/// scalar element, and the alignment for the scalar memory access.
42160static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
42161 SelectionDAG &DAG, SDValue &Addr,
42162 SDValue &Index, unsigned &Alignment) {
42163 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
42164 if (TrueMaskElt < 0)
42165 return false;
42166
42167 // Get the address of the one scalar element that is specified by the mask
42168 // using the appropriate offset from the base pointer.
42169 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
42170 Addr = MaskedOp->getBasePtr();
42171 if (TrueMaskElt != 0) {
42172 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
42173 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
42174 }
42175
42176 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
42177 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
42178 return true;
42179}
42180
42181/// If exactly one element of the mask is set for a non-extending masked load,
42182/// it is a scalar load and vector insert.
42183/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
42184/// mask have already been optimized in IR, so we don't bother with those here.
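/// A worked example (an added sketch, not from the original source): a masked
/// v4f32 load with mask <0,0,1,0> becomes a scalar f32 load from
/// base + 2 * sizeof(float) followed by an INSERT_VECTOR_ELT into the
/// pass-through vector at index 2.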
42185static SDValue
42186reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
42187 TargetLowering::DAGCombinerInfo &DCI) {
42188 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
42189 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
42190 // However, some target hooks may need to be added to know when the transform
42191 // is profitable. Endianness would also have to be considered.
42192
42193 SDValue Addr, VecIndex;
42194 unsigned Alignment;
42195 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
42196 return SDValue();
42197
42198 // Load the one scalar element that is specified by the mask using the
42199 // appropriate offset from the base pointer.
42200 SDLoc DL(ML);
42201 EVT VT = ML->getValueType(0);
42202 EVT EltVT = VT.getVectorElementType();
42203 SDValue Load =
42204 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
42205 Alignment, ML->getMemOperand()->getFlags());
42206
42207 // Insert the loaded element into the appropriate place in the vector.
42208 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
42209 ML->getPassThru(), Load, VecIndex);
42210 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
42211}
42212
42213static SDValue
42214combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
42215 TargetLowering::DAGCombinerInfo &DCI) {
42216 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
42217 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
42218 return SDValue();
42219
42220 SDLoc DL(ML);
42221 EVT VT = ML->getValueType(0);
42222
42223 // If we are loading the first and last elements of a vector, it is safe and
42224 // always faster to load the whole vector. Replace the masked load with a
42225 // vector load and select.
42226 unsigned NumElts = VT.getVectorNumElements();
42227 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
42228 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
42229 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
42230 if (LoadFirstElt && LoadLastElt) {
42231 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
42232 ML->getMemOperand());
42233 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
42234 ML->getPassThru());
42235 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
42236 }
42237
42238 // Convert a masked load with a constant mask into a masked load and a select.
42239 // This allows the select operation to use a faster kind of select instruction
42240 // (for example, vblendvps -> vblendps).
42241
42242 // Don't try this if the pass-through operand is already undefined. That would
42243 // cause an infinite loop because that's what we're about to create.
42244 if (ML->getPassThru().isUndef())
42245 return SDValue();
42246
42247 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
42248 return SDValue();
42249
42250 // The new masked load has an undef pass-through operand. The select uses the
42251 // original pass-through operand.
42252 SDValue NewML = DAG.getMaskedLoad(
42253 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
42254 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
42255 ML->getAddressingMode(), ML->getExtensionType());
42256 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
42257 ML->getPassThru());
42258
42259 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
42260}
42261
42262static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
42263 TargetLowering::DAGCombinerInfo &DCI,
42264 const X86Subtarget &Subtarget) {
42265 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
42266
42267 // TODO: Expanding load with constant mask may be optimized as well.
42268 if (Mld->isExpandingLoad())
42269 return SDValue();
42270
42271 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
42272 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
42273 return ScalarLoad;
42274 // TODO: Do some AVX512 subsets benefit from this transform?
42275 if (!Subtarget.hasAVX512())
42276 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
42277 return Blend;
42278 }
42279
42280 return SDValue();
42281}
42282
42283/// If exactly one element of the mask is set for a non-truncating masked store,
42284/// it is a vector extract and scalar store.
42285/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
42286/// mask have already been optimized in IR, so we don't bother with those here.
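/// A worked example (an added sketch, not from the original source): a masked
/// v4i32 store with mask <0,1,0,0> becomes an EXTRACT_VECTOR_ELT of lane 1
/// followed by a scalar i32 store to base + 1 * sizeof(i32).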
42287static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
42288 SelectionDAG &DAG) {
42289 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
42290 // However, some target hooks may need to be added to know when the transform
42291 // is profitable. Endianness would also have to be considered.
42292
42293 SDValue Addr, VecIndex;
42294 unsigned Alignment;
42295 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
42296 return SDValue();
42297
42298 // Extract the one scalar element that is actually being stored.
42299 SDLoc DL(MS);
42300 EVT VT = MS->getValue().getValueType();
42301 EVT EltVT = VT.getVectorElementType();
42302 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
42303 MS->getValue(), VecIndex);
42304
42305 // Store that element at the appropriate offset from the base pointer.
42306 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
42307 Alignment, MS->getMemOperand()->getFlags());
42308}
42309
42310static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
42311 TargetLowering::DAGCombinerInfo &DCI,
42312 const X86Subtarget &Subtarget) {
42313 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
42314 if (Mst->isCompressingStore())
42315 return SDValue();
42316
42317 EVT VT = Mst->getValue().getValueType();
42318 SDLoc dl(Mst);
42319 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42320
42321 if (Mst->isTruncatingStore())
42322 return SDValue();
42323
42324 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
42325 return ScalarStore;
42326
42327 // If the mask value has been legalized to a non-boolean vector, try to
42328 // simplify ops leading up to it. We only demand the MSB of each lane.
42329 SDValue Mask = Mst->getMask();
42330 if (Mask.getScalarValueSizeInBits() != 1) {
42331 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
42332 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
42333 if (N->getOpcode() != ISD::DELETED_NODE)
42334 DCI.AddToWorklist(N);
42335 return SDValue(N, 0);
42336 }
42337 if (SDValue NewMask =
42338 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
42339 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
42340 Mst->getBasePtr(), Mst->getOffset(), NewMask,
42341 Mst->getMemoryVT(), Mst->getMemOperand(),
42342 Mst->getAddressingMode());
42343 }
42344
42345 SDValue Value = Mst->getValue();
42346 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
42347 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
42348 Mst->getMemoryVT())) {
42349 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
42350 Mst->getBasePtr(), Mst->getOffset(), Mask,
42351 Mst->getMemoryVT(), Mst->getMemOperand(),
42352 Mst->getAddressingMode(), true);
42353 }
42354
42355 return SDValue();
42356}
42357
42358static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
42359 TargetLowering::DAGCombinerInfo &DCI,
42360 const X86Subtarget &Subtarget) {
42361 StoreSDNode *St = cast<StoreSDNode>(N);
42362 EVT StVT = St->getMemoryVT();
42363 SDLoc dl(St);
42364 unsigned Alignment = St->getAlignment();
42365 SDValue StoredVal = St->getValue();
42366 EVT VT = StoredVal.getValueType();
42367 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42368
42369 // Convert a store of vXi1 into a store of iX and a bitcast.
42370 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
42371 VT.getVectorElementType() == MVT::i1) {
42372
42373 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
42374 StoredVal = DAG.getBitcast(NewVT, StoredVal);
42375
42376 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
42377 St->getPointerInfo(), St->getAlignment(),
42378 St->getMemOperand()->getFlags());
42379 }
42380
42381 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
42382 // This will avoid a copy to k-register.
42383 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
42384 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42385 StoredVal.getOperand(0).getValueType() == MVT::i8) {
42386 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
42387 St->getBasePtr(), St->getPointerInfo(),
42388 St->getAlignment(), St->getMemOperand()->getFlags());
42389 }
42390
42391 // Widen v2i1/v4i1 stores to v8i1.
42392 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
42393 Subtarget.hasAVX512()) {
42394 unsigned NumConcats = 8 / VT.getVectorNumElements();
42395 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
42396 Ops[0] = StoredVal;
42397 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
42398 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
42399 St->getPointerInfo(), St->getAlignment(),
42400 St->getMemOperand()->getFlags());
42401 }
42402
42403 // Turn vXi1 stores of constants into a scalar store.
42404 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
42405 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
42406 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
42407 // If it's a v64i1 store without 64-bit support, we need two stores.
42408 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
42409 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
42410 StoredVal->ops().slice(0, 32));
42411 Lo = combinevXi1ConstantToInteger(Lo, DAG);
42412 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
42413 StoredVal->ops().slice(32, 32));
42414 Hi = combinevXi1ConstantToInteger(Hi, DAG);
42415
42416 SDValue Ptr0 = St->getBasePtr();
42417 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
42418
42419 SDValue Ch0 =
42420 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
42421 Alignment, St->getMemOperand()->getFlags());
42422 SDValue Ch1 =
42423 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
42424 St->getPointerInfo().getWithOffset(4),
42425 MinAlign(Alignment, 4U),
42426 St->getMemOperand()->getFlags());
42427 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
42428 }
42429
42430 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
42431 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
42432 St->getPointerInfo(), St->getAlignment(),
42433 St->getMemOperand()->getFlags());
42434 }
42435
42436 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
42437 // Sandy Bridge, perform two 16-byte stores.
42438 bool Fast;
42439 if (VT.is256BitVector() && StVT == VT &&
42440 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
42441 *St->getMemOperand(), &Fast) &&
42442 !Fast) {
42443 unsigned NumElems = VT.getVectorNumElements();
42444 if (NumElems < 2)
42445 return SDValue();
42446
42447 return splitVectorStore(St, DAG);
42448 }
42449
42450 // Split under-aligned vector non-temporal stores.
42451 if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
42452 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
42453 // vectors or the legalizer can scalarize it to use MOVNTI.
42454 if (VT.is256BitVector() || VT.is512BitVector()) {
42455 unsigned NumElems = VT.getVectorNumElements();
42456 if (NumElems < 2)
42457 return SDValue();
42458 return splitVectorStore(St, DAG);
42459 }
42460
42461 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
42462 // to use MOVNTI.
42463 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
42464 MVT NTVT = Subtarget.hasSSE4A()
42465 ? MVT::v2f64
42466 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
42467 return scalarizeVectorStore(St, NTVT, DAG);
42468 }
42469 }
42470
42471 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
42472 // supported, but avx512f is by extending to v16i32 and truncating.
42473 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
42474 St->getValue().getOpcode() == ISD::TRUNCATE &&
42475 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
42476 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
42477 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
42478 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
42479 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
42480 MVT::v16i8, St->getMemOperand());
42481 }
42482
42483 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
42484 if (!St->isTruncatingStore() && StoredVal.hasOneUse() &&
42485 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
42486 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
42487 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
42488 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
42489 return EmitTruncSStore(IsSigned, St->getChain(),
42490 dl, StoredVal.getOperand(0), St->getBasePtr(),
42491 VT, St->getMemOperand(), DAG);
42492 }
42493
42494 // Optimize trunc store (of multiple scalars) to shuffle and store.
42495 // First, pack all of the elements in one place. Next, store to memory
42496 // in fewer chunks.
42497 if (St->isTruncatingStore() && VT.isVector()) {
42498 // Check if we can detect an AVG pattern from the truncation. If yes,
42499 // replace the trunc store by a normal store with the result of X86ISD::AVG
42500 // instruction.
42501 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
42502 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
42503 Subtarget, dl))
42504 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
42505 St->getPointerInfo(), St->getAlignment(),
42506 St->getMemOperand()->getFlags());
42507
42508 if (TLI.isTruncStoreLegal(VT, StVT)) {
42509 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
42510 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
42511 dl, Val, St->getBasePtr(),
42512 St->getMemoryVT(), St->getMemOperand(), DAG);
42513 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
42514 DAG, dl))
42515 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
42516 dl, Val, St->getBasePtr(),
42517 St->getMemoryVT(), St->getMemOperand(), DAG);
42518 }
42519
42520 return SDValue();
42521 }
42522
42523 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
42524 // the FP state in cases where an emms may be missing.
42525 // A preferable solution to the general problem is to figure out the right
42526 // places to insert EMMS. This qualifies as a quick hack.
42527
42528 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
42529 if (VT.getSizeInBits() != 64)
42530 return SDValue();
42531
42532 const Function &F = DAG.getMachineFunction().getFunction();
42533 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
42534 bool F64IsLegal =
42535 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
42536 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
42537 isa<LoadSDNode>(St->getValue()) &&
42538 cast<LoadSDNode>(St->getValue())->isSimple() &&
42539 St->getChain().hasOneUse() && St->isSimple()) {
42540 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
42541
42542 if (!ISD::isNormalLoad(Ld))
42543 return SDValue();
42544
42545 // Avoid the transformation if there are multiple uses of the loaded value.
42546 if (!Ld->hasNUsesOfValue(1, 0))
42547 return SDValue();
42548
42549 SDLoc LdDL(Ld);
42550 SDLoc StDL(N);
42551 // Lower to a single movq load/store pair.
42552 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
42553 Ld->getBasePtr(), Ld->getMemOperand());
42554
42555 // Make sure new load is placed in same chain order.
42556 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
42557 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
42558 St->getMemOperand());
42559 }
42560
42561 // This is similar to the above case, but here we handle a scalar 64-bit
42562 // integer store that is extracted from a vector on a 32-bit target.
42563 // If we have SSE2, then we can treat it like a floating-point double
42564 // to get past legalization. The execution dependencies fixup pass will
42565 // choose the optimal machine instruction for the store if this really is
42566 // an integer or v2f32 rather than an f64.
42567 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
42568 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
42569 SDValue OldExtract = St->getOperand(1);
42570 SDValue ExtOp0 = OldExtract.getOperand(0);
42571 unsigned VecSize = ExtOp0.getValueSizeInBits();
42572 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
42573 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
42574 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
42575 BitCast, OldExtract.getOperand(1));
42576 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
42577 St->getPointerInfo(), St->getAlignment(),
42578 St->getMemOperand()->getFlags());
42579 }
42580
42581 return SDValue();
42582}
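// A minimal standalone sketch (not from the LLVM sources) of the vXi1 -> iX
// store rewrite performed above: the i1 lanes are packed into a single integer
// and stored with one scalar store instead of storing a vector of booleans.
// The helper names below are illustrative only.
#include <cstdint>

static uint8_t packMaskToByte(const bool Lanes[8]) {
  uint8_t Bits = 0;
  for (int I = 0; I != 8; ++I)
    Bits |= static_cast<uint8_t>(Lanes[I]) << I; // lane i becomes bit i
  return Bits;
}

static void storeMaskByte(uint8_t *Dst, const bool Lanes[8]) {
  *Dst = packMaskToByte(Lanes); // one i8 store replaces a v8i1 store
}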
42583
42584static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
42585 TargetLowering::DAGCombinerInfo &DCI,
42586 const X86Subtarget &Subtarget) {
42587 auto *St = cast<MemIntrinsicSDNode>(N);
42588
42589 SDValue StoredVal = N->getOperand(1);
42590 MVT VT = StoredVal.getSimpleValueType();
42591 EVT MemVT = St->getMemoryVT();
42592
42593 // Figure out which elements we demand.
42594 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
42595 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
42596
42597 APInt KnownUndef, KnownZero;
42598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42599 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
42600 KnownZero, DCI)) {
42601 if (N->getOpcode() != ISD::DELETED_NODE)
42602 DCI.AddToWorklist(N);
42603 return SDValue(N, 0);
42604 }
42605
42606 return SDValue();
42607}
42608
42609/// Return 'true' if this vector operation is "horizontal"
42610/// and return the operands for the horizontal operation in LHS and RHS. A
42611/// horizontal operation performs the binary operation on successive elements
42612/// of its first operand, then on successive elements of its second operand,
42613/// returning the resulting values in a vector. For example, if
42614/// A = < float a0, float a1, float a2, float a3 >
42615/// and
42616/// B = < float b0, float b1, float b2, float b3 >
42617/// then the result of doing a horizontal operation on A and B is
42618/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
42619/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
42620/// A horizontal-op B, for some already available A and B, and if so then LHS is
42621/// set to A, RHS to B, and the routine returns 'true'.
42622static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
42623 const X86Subtarget &Subtarget,
42624 bool IsCommutative) {
42625 // If either operand is undef, bail out. The binop should be simplified.
42626 if (LHS.isUndef() || RHS.isUndef())
42627 return false;
42628
42629 // Look for the following pattern:
42630 // A = < float a0, float a1, float a2, float a3 >
42631 // B = < float b0, float b1, float b2, float b3 >
42632 // and
42633 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
42634 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
42635 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
42636 // which is A horizontal-op B.
42637
42638 MVT VT = LHS.getSimpleValueType();
42639 assert((VT.is128BitVector() || VT.is256BitVector()) &&
42640 "Unsupported vector type for horizontal add/sub");
42641 unsigned NumElts = VT.getVectorNumElements();
42642
42643 // TODO - can we make a general helper method that does all of this for us?
42644 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
42645 SmallVectorImpl<int> &ShuffleMask) {
42646 if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
42647 if (!Op.getOperand(0).isUndef())
42648 N0 = Op.getOperand(0);
42649 if (!Op.getOperand(1).isUndef())
42650 N1 = Op.getOperand(1);
42651 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
42652 ShuffleMask.append(Mask.begin(), Mask.end());
42653 return;
42654 }
42655 bool UseSubVector = false;
42656 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
42657 Op.getOperand(0).getValueType().is256BitVector() &&
42658 llvm::isNullConstant(Op.getOperand(1))) {
42659 Op = Op.getOperand(0);
42660 UseSubVector = true;
42661 }
42662 bool IsUnary;
42663 SmallVector<SDValue, 2> SrcOps;
42664 SmallVector<int, 16> SrcShuffleMask;
42665 SDValue BC = peekThroughBitcasts(Op);
42666 if (isTargetShuffle(BC.getOpcode()) &&
42667 getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
42668 SrcOps, SrcShuffleMask, IsUnary)) {
42669 if (!UseSubVector && SrcShuffleMask.size() == NumElts &&
42670 SrcOps.size() <= 2) {
42671 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
42672 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
42673 ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
42674 }
42675 if (UseSubVector && (SrcShuffleMask.size() == (NumElts * 2)) &&
42676 SrcOps.size() == 1) {
42677 N0 = extract128BitVector(SrcOps[0], 0, DAG, SDLoc(Op));
42678 N1 = extract128BitVector(SrcOps[0], NumElts, DAG, SDLoc(Op));
42679 ArrayRef<int> Mask = ArrayRef<int>(SrcShuffleMask).slice(0, NumElts);
42680 ShuffleMask.append(Mask.begin(), Mask.end());
42681 }
42682 }
42683 };
42684
42685 // View LHS in the form
42686 // LHS = VECTOR_SHUFFLE A, B, LMask
42687 // If LHS is not a shuffle, then pretend it is the identity shuffle:
42688 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
42689 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
42690 SDValue A, B;
42691 SmallVector<int, 16> LMask;
42692 GetShuffle(LHS, A, B, LMask);
42693
42694 // Likewise, view RHS in the form
42695 // RHS = VECTOR_SHUFFLE C, D, RMask
42696 SDValue C, D;
42697 SmallVector<int, 16> RMask;
42698 GetShuffle(RHS, C, D, RMask);
42699
42700 // At least one of the operands should be a vector shuffle.
42701 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
42702 if (NumShuffles == 0)
42703 return false;
42704
42705 if (LMask.empty()) {
42706 A = LHS;
42707 for (unsigned i = 0; i != NumElts; ++i)
42708 LMask.push_back(i);
42709 }
42710
42711 if (RMask.empty()) {
42712 C = RHS;
42713 for (unsigned i = 0; i != NumElts; ++i)
42714 RMask.push_back(i);
42715 }
42716
42717 // If A and B occur in reverse order in RHS, then canonicalize by commuting
42718 // RHS operands and shuffle mask.
42719 if (A != C) {
42720 std::swap(C, D);
42721 ShuffleVectorSDNode::commuteMask(RMask);
42722 }
42723 // Check that the shuffles are both shuffling the same vectors.
42724 if (!(A == C && B == D))
42725 return false;
42726
42727 // LHS and RHS are now:
42728 // LHS = shuffle A, B, LMask
42729 // RHS = shuffle A, B, RMask
42730 // Check that the masks correspond to performing a horizontal operation.
42731 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
42732 // so we just repeat the inner loop if this is a 256-bit op.
42733 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
42734 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
42735 assert((NumEltsPer128BitChunk % 2 == 0) &&
42736 "Vector type should have an even number of elements in each lane");
42737 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
42738 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
42739 // Ignore undefined components.
42740 int LIdx = LMask[i + j], RIdx = RMask[i + j];
42741 if (LIdx < 0 || RIdx < 0 ||
42742 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
42743 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
42744 continue;
42745
42746 // The low half of the 128-bit result must choose from A.
42747 // The high half of the 128-bit result must choose from B,
42748 // unless B is undef. In that case, we are always choosing from A.
42749 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
42750 unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
42751
42752 // Check that successive elements are being operated on. If not, this is
42753 // not a horizontal operation.
42754 int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
42755 if (!(LIdx == Index && RIdx == Index + 1) &&
42756 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
42757 return false;
42758 }
42759 }
42760
42761 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
42762 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
42763
42764 if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
42765 return false;
42766
42767 LHS = DAG.getBitcast(VT, LHS);
42768 RHS = DAG.getBitcast(VT, RHS);
42769 return true;
42770}
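// A scalar reference (not from the LLVM sources) for the pattern matched by
// isHorizontalBinOp above: adjacent pairs of A are combined first, then
// adjacent pairs of B, which matches the 128-bit HADDPS behaviour. Using '-'
// instead of '+' gives the horizontal-sub form.
static void horizontalAdd4(const float A[4], const float B[4], float R[4]) {
  R[0] = A[0] + A[1];
  R[1] = A[2] + A[3];
  R[2] = B[0] + B[1];
  R[3] = B[2] + B[3];
}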
42771
42772/// Do target-specific dag combines on floating-point adds/subs.
42773static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
42774 const X86Subtarget &Subtarget) {
42775 EVT VT = N->getValueType(0);
42776 SDValue LHS = N->getOperand(0);
42777 SDValue RHS = N->getOperand(1);
42778 bool IsFadd = N->getOpcode() == ISD::FADD;
42779 auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
42780 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
42781
42782 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
42783 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
42784 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
42785 isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
42786 return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
42787
42788 return SDValue();
42789}
42790
42791/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
42792/// the codegen.
42793/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
42794/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
42795/// anything that is guaranteed to be transformed by DAGCombiner.
42796static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
42797 const X86Subtarget &Subtarget,
42798 const SDLoc &DL) {
42799 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
42800 SDValue Src = N->getOperand(0);
42801 unsigned SrcOpcode = Src.getOpcode();
42802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42803
42804 EVT VT = N->getValueType(0);
42805 EVT SrcVT = Src.getValueType();
42806
42807 auto IsFreeTruncation = [VT](SDValue Op) {
42808 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
42809
42810 // See if this has been extended from a smaller/equal size to
42811 // the truncation size, allowing a truncation to combine with the extend.
42812 unsigned Opcode = Op.getOpcode();
42813 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
42814 Opcode == ISD::ZERO_EXTEND) &&
42815 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
42816 return true;
42817
42818 // See if this is a single use constant which can be constant folded.
42819 // NOTE: We don't peek through bitcasts here because there is currently
42820 // no support for constant folding truncate+bitcast+vector_of_constants. So
42821 // we'll just end up with a truncate on both operands which will
42822 // get turned back into (truncate (binop)) causing an infinite loop.
42823 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
42824 };
42825
42826 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
42827 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
42828 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
42829 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
42830 };
42831
42832 // Don't combine if the operation has other uses.
42833 if (!Src.hasOneUse())
42834 return SDValue();
42835
42836 // Only support vector truncation for now.
42837 // TODO: i64 scalar math would benefit as well.
42838 if (!VT.isVector())
42839 return SDValue();
42840
42841 // In most cases it's only worth pre-truncating if we're only facing the cost
42842 // of one truncation.
42843 // i.e. if one of the inputs will constant fold or the input is repeated.
42844 switch (SrcOpcode) {
42845 case ISD::AND:
42846 case ISD::XOR:
42847 case ISD::OR: {
42848 SDValue Op0 = Src.getOperand(0);
42849 SDValue Op1 = Src.getOperand(1);
42850 if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
42851 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
42852 return TruncateArithmetic(Op0, Op1);
42853 break;
42854 }
42855
42856 case ISD::MUL:
42857 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
42858 // better to truncate if we have the chance.
42859 if (SrcVT.getScalarType() == MVT::i64 &&
42860 TLI.isOperationLegal(SrcOpcode, VT) &&
42861 !TLI.isOperationLegal(SrcOpcode, SrcVT))
42862 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
42863 LLVM_FALLTHROUGH;
42864 case ISD::ADD: {
42865 SDValue Op0 = Src.getOperand(0);
42866 SDValue Op1 = Src.getOperand(1);
42867 if (TLI.isOperationLegal(SrcOpcode, VT) &&
42868 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
42869 return TruncateArithmetic(Op0, Op1);
42870 break;
42871 }
42872 case ISD::SUB: {
42873 // TODO: ISD::SUB: We are conservative and require both sides to be freely
42874 // truncatable to avoid interfering with combineSubToSubus.
42875 SDValue Op0 = Src.getOperand(0);
42876 SDValue Op1 = Src.getOperand(1);
42877 if (TLI.isOperationLegal(SrcOpcode, VT) &&
42878 (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
42879 return TruncateArithmetic(Op0, Op1);
42880 break;
42881 }
42882 }
42883
42884 return SDValue();
42885}
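// A small standalone check (not from the LLVM sources) of why the
// pre-truncation above is sound for these opcodes: for AND/OR/XOR/ADD/SUB/MUL
// the low N bits of the result depend only on the low N bits of the inputs,
// so truncating the operands first yields the same low bits as truncating the
// wide result.
#include <cassert>
#include <cstdint>

static void truncOfAddEqualsAddOfTrunc(uint64_t X, uint64_t Y) {
  uint16_t NarrowFirst =
      static_cast<uint16_t>(static_cast<uint16_t>(X) + static_cast<uint16_t>(Y));
  uint16_t WideFirst = static_cast<uint16_t>(X + Y);
  assert(NarrowFirst == WideFirst && "low 16 bits agree for ADD");
  (void)NarrowFirst;
  (void)WideFirst;
}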
42886
42887/// Truncate using ISD::AND mask and X86ISD::PACKUS.
42888/// e.g. trunc <8 x i32> X to <8 x i16> -->
42889/// MaskX = X & 0xffff (clear high bits to prevent saturation)
42890/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
42891static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
42892 const X86Subtarget &Subtarget,
42893 SelectionDAG &DAG) {
42894 SDValue In = N->getOperand(0);
42895 EVT InVT = In.getValueType();
42896 EVT OutVT = N->getValueType(0);
42897
42898 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
42899 OutVT.getScalarSizeInBits());
42900 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
42901 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
42902}
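// A per-element sketch (not from the LLVM sources) of the mask-then-PACKUS
// trick documented above: PACKUS saturates its inputs to the unsigned range of
// the narrower type, so clearing the high bits first guarantees the value is
// already in range and the pack degenerates to a plain truncation.
#include <algorithm>
#include <cstdint>

static uint16_t maskThenPackus32to16(uint32_t In) {
  uint32_t Masked = In & 0xFFFFu;                                      // the ISD::AND step
  return static_cast<uint16_t>(std::min<uint32_t>(Masked, 0xFFFFu));   // saturate (now a no-op)
}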
42903
42904/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
42905static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
42906 const X86Subtarget &Subtarget,
42907 SelectionDAG &DAG) {
42908 SDValue In = N->getOperand(0);
42909 EVT InVT = In.getValueType();
42910 EVT OutVT = N->getValueType(0);
42911 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
42912 DAG.getValueType(OutVT));
42913 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
42914}
42915
42916/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
42917/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
42918/// legalization the truncation will be translated into a BUILD_VECTOR with each
42919/// element that is extracted from a vector and then truncated, and it is
42920/// difficult to do this optimization based on them.
42921static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
42922 const X86Subtarget &Subtarget) {
42923 EVT OutVT = N->getValueType(0);
42924 if (!OutVT.isVector())
42925 return SDValue();
42926
42927 SDValue In = N->getOperand(0);
42928 if (!In.getValueType().isSimple())
42929 return SDValue();
42930
42931 EVT InVT = In.getValueType();
42932 unsigned NumElems = OutVT.getVectorNumElements();
42933
42934 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
42935 // SSE2, and we need to take care of it specially.
42936 // AVX512 provides vpmovdb.
42937 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
42938 return SDValue();
42939
42940 EVT OutSVT = OutVT.getVectorElementType();
42941 EVT InSVT = InVT.getVectorElementType();
42942 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
42943 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
42944 NumElems >= 8))
42945 return SDValue();
42946
42947 // SSSE3's pshufb results in fewer instructions in the cases below.
42948 if (Subtarget.hasSSSE3() && NumElems == 8 &&
42949 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
42950 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
42951 return SDValue();
42952
42953 SDLoc DL(N);
42954 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
42955 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
42956 // truncate 2 x v4i32 to v8i16.
42957 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
42958 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
42959 if (InSVT == MVT::i32)
42960 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
42961
42962 return SDValue();
42963}
42964
42965/// This function transforms vector truncation of 'extended sign-bits' or
42966/// 'extended zero-bits' values.
42967/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
42968static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
42969 SelectionDAG &DAG,
42970 const X86Subtarget &Subtarget) {
42971 // Requires SSE2.
42972 if (!Subtarget.hasSSE2())
42973 return SDValue();
42974
42975 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
42976 return SDValue();
42977
42978 SDValue In = N->getOperand(0);
42979 if (!In.getValueType().isSimple())
42980 return SDValue();
42981
42982 MVT VT = N->getValueType(0).getSimpleVT();
42983 MVT SVT = VT.getScalarType();
42984
42985 MVT InVT = In.getValueType().getSimpleVT();
42986 MVT InSVT = InVT.getScalarType();
42987
42988 // Check we have a truncation suited for PACKSS/PACKUS.
42989 if (!VT.is128BitVector() && !VT.is256BitVector())
42990 return SDValue();
42991 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
42992 return SDValue();
42993 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
42994 return SDValue();
42995
42996 // AVX512 has fast truncate, but if the input is already going to be split,
42997 // there's no harm in trying pack.
42998 if (Subtarget.hasAVX512() &&
42999 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
43000 InVT.is512BitVector()))
43001 return SDValue();
43002
43003 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
43004 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
43005
43006 // Use PACKUS if the input has zero-bits that extend all the way to the
43007 // packed/truncated value. e.g. masks, zext_in_reg, etc.
43008 KnownBits Known = DAG.computeKnownBits(In);
43009 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
43010 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
43011 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
43012
43013 // Use PACKSS if the input has sign-bits that extend all the way to the
43014 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
43015 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
43016 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
43017 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
43018
43019 return SDValue();
43020}
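// A scalar sketch (not from the LLVM sources) of the PACKUS legality test
// above: when an i32 lane is known to have at least 16 leading zero bits, its
// value already fits in 16 bits, so the unsigned-saturating pack cannot change
// it and the truncation is lossless.
#include <cassert>
#include <cstdint>

static void packusIsLosslessForZeroExtendedLanes(uint32_t Lane) {
  if ((Lane >> 16) == 0) // at least 16 known leading zeros
    assert(static_cast<uint32_t>(static_cast<uint16_t>(Lane)) == Lane &&
           "packing preserves the value");
}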
43021
43022// Try to form a MULHU or MULHS node by looking for
43023// (trunc (srl (mul ext, ext), 16))
43024// TODO: This is X86 specific because we want to be able to handle wide types
43025// before type legalization. But we can only do it if the vector will be
43026// legalized via widening/splitting. Type legalization can't handle promotion
43027// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
43028// combiner.
43029static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
43030 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
43031 // First instruction should be a right shift of a multiply.
43032 if (Src.getOpcode() != ISD::SRL ||
43033 Src.getOperand(0).getOpcode() != ISD::MUL)
43034 return SDValue();
43035
43036 if (!Subtarget.hasSSE2())
43037 return SDValue();
43038
43039 // Only handle vXi16 types that are at least 128 bits unless they will be
43040 // widened.
43041 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
43042 return SDValue();
43043
43044 // Input type should be vXi32.
43045 EVT InVT = Src.getValueType();
43046 if (InVT.getVectorElementType() != MVT::i32)
43047 return SDValue();
43048
43049 // Need a shift by 16.
43050 APInt ShiftAmt;
43051 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
43052 ShiftAmt != 16)
43053 return SDValue();
43054
43055 SDValue LHS = Src.getOperand(0).getOperand(0);
43056 SDValue RHS = Src.getOperand(0).getOperand(1);
43057
43058 unsigned ExtOpc = LHS.getOpcode();
43059 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
43060 RHS.getOpcode() != ExtOpc)
43061 return SDValue();
43062
43063 // Peek through the extends.
43064 LHS = LHS.getOperand(0);
43065 RHS = RHS.getOperand(0);
43066
43067 // Ensure the input types match.
43068 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
43069 return SDValue();
43070
43071 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
43072 return DAG.getNode(Opc, DL, VT, LHS, RHS);
43073}
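// A scalar model (not from the LLVM sources) of the MULHU formed above:
// (trunc (srl (mul (zext a), (zext b)), 16)) is exactly the high half of the
// 16x16->32 product, which is what PMULHUW computes per lane. The signed
// variant uses sign extension and PMULHW instead.
#include <cstdint>

static uint16_t mulhu16(uint16_t A, uint16_t B) {
  uint32_t Wide = static_cast<uint32_t>(A) * static_cast<uint32_t>(B); // zext + mul
  return static_cast<uint16_t>(Wide >> 16);                            // srl 16 + trunc
}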
43074
43075// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
43076// from one vector with signed bytes from another vector, adds together
43077// adjacent pairs of 16-bit products, and saturates the result before
43078// truncating to 16-bits.
43079//
43080// Which looks something like this:
43081// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
43082// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
43083static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
43084 const X86Subtarget &Subtarget,
43085 const SDLoc &DL) {
43086 if (!VT.isVector() || !Subtarget.hasSSSE3())
43087 return SDValue();
43088
43089 unsigned NumElems = VT.getVectorNumElements();
43090 EVT ScalarVT = VT.getVectorElementType();
43091 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
43092 return SDValue();
43093
43094 SDValue SSatVal = detectSSatPattern(In, VT);
43095 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
43096 return SDValue();
43097
43098 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
43099 // of multiplies from even/odd elements.
43100 SDValue N0 = SSatVal.getOperand(0);
43101 SDValue N1 = SSatVal.getOperand(1);
43102
43103 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
43104 return SDValue();
43105
43106 SDValue N00 = N0.getOperand(0);
43107 SDValue N01 = N0.getOperand(1);
43108 SDValue N10 = N1.getOperand(0);
43109 SDValue N11 = N1.getOperand(1);
43110
43111 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
43112 // Canonicalize zero_extend to LHS.
43113 if (N01.getOpcode() == ISD::ZERO_EXTEND)
43114 std::swap(N00, N01);
43115 if (N11.getOpcode() == ISD::ZERO_EXTEND)
43116 std::swap(N10, N11);
43117
43118 // Ensure we have a zero_extend and a sign_extend.
43119 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
43120 N01.getOpcode() != ISD::SIGN_EXTEND ||
43121 N10.getOpcode() != ISD::ZERO_EXTEND ||
43122 N11.getOpcode() != ISD::SIGN_EXTEND)
43123 return SDValue();
43124
43125 // Peek through the extends.
43126 N00 = N00.getOperand(0);
43127 N01 = N01.getOperand(0);
43128 N10 = N10.getOperand(0);
43129 N11 = N11.getOperand(0);
43130
43131 // Ensure the extend is from vXi8.
43132 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
43133 N01.getValueType().getVectorElementType() != MVT::i8 ||
43134 N10.getValueType().getVectorElementType() != MVT::i8 ||
43135 N11.getValueType().getVectorElementType() != MVT::i8)
43136 return SDValue();
43137
43138 // All inputs should be build_vectors.
43139 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
43140 N01.getOpcode() != ISD::BUILD_VECTOR ||
43141 N10.getOpcode() != ISD::BUILD_VECTOR ||
43142 N11.getOpcode() != ISD::BUILD_VECTOR)
43143 return SDValue();
43144
43145 // N00/N10 are zero extended. N01/N11 are sign extended.
43146
43147 // For each element, we need to ensure we have an odd element from one vector
43148 // multiplied by the odd element of another vector and the even element from
43149 // one of the same vectors being multiplied by the even element from the
43150 // other vector. So we need to make sure for each element i, this operator
43151 // is being performed:
43152 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
43153 SDValue ZExtIn, SExtIn;
43154 for (unsigned i = 0; i != NumElems; ++i) {
43155 SDValue N00Elt = N00.getOperand(i);
43156 SDValue N01Elt = N01.getOperand(i);
43157 SDValue N10Elt = N10.getOperand(i);
43158 SDValue N11Elt = N11.getOperand(i);
43159 // TODO: Be more tolerant to undefs.
43160 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43161 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43162 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
43163 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
43164 return SDValue();
43165 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
43166 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
43167 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
43168 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
43169 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
43170 return SDValue();
43171 unsigned IdxN00 = ConstN00Elt->getZExtValue();
43172 unsigned IdxN01 = ConstN01Elt->getZExtValue();
43173 unsigned IdxN10 = ConstN10Elt->getZExtValue();
43174 unsigned IdxN11 = ConstN11Elt->getZExtValue();
43175 // Add is commutative so indices can be reordered.
43176 if (IdxN00 > IdxN10) {
43177 std::swap(IdxN00, IdxN10);
43178 std::swap(IdxN01, IdxN11);
43179 }
43180 // N0 indices must be the even element. N1 indices must be the next odd element.
43181 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
43182 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
43183 return SDValue();
43184 SDValue N00In = N00Elt.getOperand(0);
43185 SDValue N01In = N01Elt.getOperand(0);
43186 SDValue N10In = N10Elt.getOperand(0);
43187 SDValue N11In = N11Elt.getOperand(0);
43188 // First time we find an input capture it.
43189 if (!ZExtIn) {
43190 ZExtIn = N00In;
43191 SExtIn = N01In;
43192 }
43193 if (ZExtIn != N00In || SExtIn != N01In ||
43194 ZExtIn != N10In || SExtIn != N11In)
43195 return SDValue();
43196 }
43197
43198 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43199 ArrayRef<SDValue> Ops) {
43200 // Shrink by adding truncate nodes and let DAGCombine fold with the
43201 // sources.
43202 EVT InVT = Ops[0].getValueType();
43203 assert(InVT.getScalarType() == MVT::i8 &&
43204 "Unexpected scalar element type");
43205 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
43206 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
43207 InVT.getVectorNumElements() / 2);
43208 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
43209 };
43210 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
43211 PMADDBuilder);
43212}
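// A per-pair scalar reference (not from the LLVM sources) for the PMADDUBSW
// semantics matched above: unsigned bytes from one input times signed bytes
// from the other, adjacent 16-bit products added, and the sum saturated to the
// signed 16-bit range.
#include <algorithm>
#include <cstdint>

static int16_t pmaddubswPair(uint8_t A0, uint8_t A1, int8_t B0, int8_t B1) {
  int32_t Sum = static_cast<int32_t>(A0) * B0 + static_cast<int32_t>(A1) * B1;
  Sum = std::min<int32_t>(std::max<int32_t>(Sum, INT16_MIN), INT16_MAX); // ssat
  return static_cast<int16_t>(Sum);
}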
43213
43214static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
43215 const X86Subtarget &Subtarget) {
43216 EVT VT = N->getValueType(0);
43217 SDValue Src = N->getOperand(0);
43218 SDLoc DL(N);
43219
43220 // Attempt to pre-truncate inputs to arithmetic ops instead.
43221 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
43222 return V;
43223
43224 // Try to detect AVG pattern first.
43225 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
43226 return Avg;
43227
43228 // Try to detect PMADD
43229 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
43230 return PMAdd;
43231
43232 // Try to combine truncation with signed/unsigned saturation.
43233 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
43234 return Val;
43235
43236 // Try to combine PMULHUW/PMULHW for vXi16.
43237 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
43238 return V;
43239
43240 // The bitcast source is a direct mmx result.
43241 // Detect bitcasts between i32 to x86mmx
43242 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
43243 SDValue BCSrc = Src.getOperand(0);
43244 if (BCSrc.getValueType() == MVT::x86mmx)
43245 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
43246 }
43247
43248 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
43249 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
43250 return V;
43251
43252 return combineVectorTruncation(N, DAG, Subtarget);
43253}
43254
43255static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
43256 EVT VT = N->getValueType(0);
43257 SDValue In = N->getOperand(0);
43258 SDLoc DL(N);
43259
43260 if (auto SSatVal = detectSSatPattern(In, VT))
43261 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
43262 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
43263 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
43264
43265 return SDValue();
43266}
43267
43268/// Returns the negated value if the node \p N flips sign of FP value.
43269///
43270/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
43271/// or FSUB(0, x)
43272/// AVX512F does not have FXOR, so FNEG is lowered as
43273/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
43274 // In this case we go through all bitcasts.
43275/// This also recognizes splat of a negated value and returns the splat of that
43276/// value.
43277static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
43278 if (N->getOpcode() == ISD::FNEG)
43279 return N->getOperand(0);
43280
43281 // Don't recurse exponentially.
43282 if (Depth > SelectionDAG::MaxRecursionDepth)
43283 return SDValue();
43284
43285 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
43286
43287 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
43288 EVT VT = Op->getValueType(0);
43289
43290 // Make sure the element size doesn't change.
43291 if (VT.getScalarSizeInBits() != ScalarSize)
43292 return SDValue();
43293
43294 unsigned Opc = Op.getOpcode();
43295 switch (Opc) {
43296 case ISD::VECTOR_SHUFFLE: {
43297 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
43298 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
43299 if (!Op.getOperand(1).isUndef())
43300 return SDValue();
43301 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
43302 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
43303 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
43304 cast<ShuffleVectorSDNode>(Op)->getMask());
43305 break;
43306 }
43307 case ISD::INSERT_VECTOR_ELT: {
43308 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
43309 // -V, INDEX).
43310 SDValue InsVector = Op.getOperand(0);
43311 SDValue InsVal = Op.getOperand(1);
43312 if (!InsVector.isUndef())
43313 return SDValue();
43314 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
43315 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
43316 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
43317 NegInsVal, Op.getOperand(2));
43318 break;
43319 }
43320 case ISD::FSUB:
43321 case ISD::XOR:
43322 case X86ISD::FXOR: {
43323 SDValue Op1 = Op.getOperand(1);
43324 SDValue Op0 = Op.getOperand(0);
43325
43326 // For XOR and FXOR, we want to check if constant
43327 // bits of Op1 are sign bit masks. For FSUB, we
43328 // have to check if constant bits of Op0 are sign
43329 // bit masks and hence we swap the operands.
43330 if (Opc == ISD::FSUB)
43331 std::swap(Op0, Op1);
43332
43333 APInt UndefElts;
43334 SmallVector<APInt, 16> EltBits;
43335 // Extract constant bits and see if they are all
43336 // sign bit masks. Ignore the undef elements.
43337 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
43338 /* AllowWholeUndefs */ true,
43339 /* AllowPartialUndefs */ false)) {
43340 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
43341 if (!UndefElts[I] && !EltBits[I].isSignMask())
43342 return SDValue();
43343
43344 return peekThroughBitcasts(Op0);
43345 }
43346 }
43347 }
43348
43349 return SDValue();
43350}
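// A standalone sketch (not from the LLVM sources) of the sign-mask forms
// recognized above: flipping only the sign bit of an IEEE-754 float is
// equivalent to FNEG, which is why xor-with-0x80000000 patterns (possibly
// hidden behind bitcasts) are treated as negations.
#include <cstdint>
#include <cstring>

static float fnegViaSignBit(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits)); // "bitcast" float -> i32
  Bits ^= 0x80000000u;                  // xor with the sign-bit mask
  std::memcpy(&X, &Bits, sizeof(Bits)); // "bitcast" back to float
  return X;
}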
43351
43352static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
43353 bool NegRes) {
43354 if (NegMul) {
43355 switch (Opcode) {
43356 default: llvm_unreachable("Unexpected opcode");
43357 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
43358 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
43359 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
43360 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
43361 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
43362 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
43363 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
43364 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
43365 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
43366 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
43367 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
43368 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
43369 }
43370 }
43371
43372 if (NegAcc) {
43373 switch (Opcode) {
43374 default: llvm_unreachable("Unexpected opcode");
43375 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
43376 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
43377 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
43378 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
43379 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
43380 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
43381 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
43382 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
43383 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
43384 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
43385 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
43386 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
43387 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
43388 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
43389 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
43390 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
43391 }
43392 }
43393
43394 if (NegRes) {
43395 switch (Opcode) {
43396 // For accuracy reasons, we never combine fneg and fma under strict FP.
43397 default: llvm_unreachable("Unexpected opcode");
43398 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
43399 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
43400 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
43401 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
43402 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
43403 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
43404 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
43405 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
43406 }
43407 }
43408
43409 return Opcode;
43410}
43411
43412/// Do target-specific dag combines on floating point negations.
43413static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
43414 TargetLowering::DAGCombinerInfo &DCI,
43415 const X86Subtarget &Subtarget) {
43416 EVT OrigVT = N->getValueType(0);
43417 SDValue Arg = isFNEG(DAG, N);
43418 if (!Arg)
43419 return SDValue();
43420
43421 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43422 EVT VT = Arg.getValueType();
43423 EVT SVT = VT.getScalarType();
43424 SDLoc DL(N);
43425
43426 // Let legalize expand this if it isn't a legal type yet.
43427 if (!TLI.isTypeLegal(VT))
43428 return SDValue();
43429
43430 // If we're negating a FMUL node on a target with FMA, then we can avoid the
43431 // use of a constant by performing (-0 - A*B) instead.
43432 // FIXME: Check rounding control flags as well once it becomes available.
43433 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
43434 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
43435 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
43436 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
43437 Arg.getOperand(1), Zero);
43438 return DAG.getBitcast(OrigVT, NewNode);
43439 }
43440
43441 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
43442 bool LegalOperations = !DCI.isBeforeLegalizeOps();
43443 if (TLI.getNegatibleCost(Arg, DAG, LegalOperations, CodeSize) !=
43444 TargetLowering::NegatibleCost::Expensive)
43445 return DAG.getBitcast(
43446 OrigVT, TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize));
43447
43448 return SDValue();
43449}
43450
43451TargetLowering::NegatibleCost
43452X86TargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG,
43453 bool LegalOperations, bool ForCodeSize,
43454 unsigned Depth) const {
43455 // fneg patterns are removable even if they have multiple uses.
43456 if (isFNEG(DAG, Op.getNode(), Depth))
43457 return NegatibleCost::Cheaper;
43458
43459 // Don't recurse exponentially.
43460 if (Depth > SelectionDAG::MaxRecursionDepth)
43461 return NegatibleCost::Expensive;
43462
43463 EVT VT = Op.getValueType();
43464 EVT SVT = VT.getScalarType();
43465 switch (Op.getOpcode()) {
43466 case ISD::FMA:
43467 case X86ISD::FMSUB:
43468 case X86ISD::FNMADD:
43469 case X86ISD::FNMSUB:
43470 case X86ISD::FMADD_RND:
43471 case X86ISD::FMSUB_RND:
43472 case X86ISD::FNMADD_RND:
43473 case X86ISD::FNMSUB_RND: {
43474 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
43475 !(SVT == MVT::f32 || SVT == MVT::f64) ||
43476 !isOperationLegal(ISD::FMA, VT))
43477 break;
43478
43479 // This is always negatible for free but we might be able to remove some
43480 // extra operand negations as well.
43481 for (int i = 0; i != 3; ++i) {
43482 NegatibleCost V = getNegatibleCost(Op.getOperand(i), DAG, LegalOperations,
43483 ForCodeSize, Depth + 1);
43484 if (V == NegatibleCost::Cheaper)
43485 return V;
43486 }
43487 return NegatibleCost::Neutral;
43488 }
43489 case X86ISD::FRCP:
43490 return getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, ForCodeSize,
43491 Depth + 1);
43492 }
43493
43494 return TargetLowering::getNegatibleCost(Op, DAG, LegalOperations, ForCodeSize,
43495 Depth);
43496}
43497
43498SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
43499 bool LegalOperations,
43500 bool ForCodeSize,
43501 unsigned Depth) const {
43502 // fneg patterns are removable even if they have multiple uses.
43503 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
43504 return DAG.getBitcast(Op.getValueType(), Arg);
43505
43506 EVT VT = Op.getValueType();
43507 EVT SVT = VT.getScalarType();
43508 unsigned Opc = Op.getOpcode();
43509 switch (Opc) {
43510 case ISD::FMA:
43511 case X86ISD::FMSUB:
43512 case X86ISD::FNMADD:
43513 case X86ISD::FNMSUB:
43514 case X86ISD::FMADD_RND:
43515 case X86ISD::FMSUB_RND:
43516 case X86ISD::FNMADD_RND:
43517 case X86ISD::FNMSUB_RND: {
43518 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
43519 !(SVT == MVT::f32 || SVT == MVT::f64) ||
43520 !isOperationLegal(ISD::FMA, VT))
43521 break;
43522
43523 // This is always negatible for free but we might be able to remove some
43524 // extra operand negations as well.
43525 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
43526 for (int i = 0; i != 3; ++i) {
43527 NegatibleCost V = getNegatibleCost(Op.getOperand(i), DAG, LegalOperations,
43528 ForCodeSize, Depth + 1);
43529 if (V == NegatibleCost::Cheaper)
43530 NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
43531 ForCodeSize, Depth + 1);
43532 }
43533
43534 bool NegA = !!NewOps[0];
43535 bool NegB = !!NewOps[1];
43536 bool NegC = !!NewOps[2];
43537 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
43538
43539 // Fill in the non-negated ops with the original values.
43540 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
43541 if (!NewOps[i])
43542 NewOps[i] = Op.getOperand(i);
43543 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
43544 }
43545 case X86ISD::FRCP:
43546 return DAG.getNode(Opc, SDLoc(Op), VT,
43547 getNegatedExpression(Op.getOperand(0), DAG,
43548 LegalOperations, ForCodeSize,
43549 Depth + 1));
43550 }
43551
43552 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
43553 ForCodeSize, Depth);
43554}
43555
43556static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
43557 const X86Subtarget &Subtarget) {
43558 MVT VT = N->getSimpleValueType(0);
43559 // If we have integer vector types available, use the integer opcodes.
43560 if (!VT.isVector() || !Subtarget.hasSSE2())
43561 return SDValue();
43562
43563 SDLoc dl(N);
43564
43565 unsigned IntBits = VT.getScalarSizeInBits();
43566 MVT IntSVT = MVT::getIntegerVT(IntBits);
43567 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
43568
43569 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
43570 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
43571 unsigned IntOpcode;
43572 switch (N->getOpcode()) {
43573 default: llvm_unreachable("Unexpected FP logic op");
43574 case X86ISD::FOR: IntOpcode = ISD::OR; break;
43575 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
43576 case X86ISD::FAND: IntOpcode = ISD::AND; break;
43577 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
43578 }
43579 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
43580 return DAG.getBitcast(VT, IntOp);
43581}
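// A scalar sketch (not from the LLVM sources) of routing an FP logic op
// through the integer domain as done above: the bit pattern of the result is
// identical, only the register domain of the operation changes.
#include <cstdint>
#include <cstring>

static float fandViaIntegerOps(float A, float B) {
  uint32_t IA, IB;
  std::memcpy(&IA, &A, sizeof(IA)); // bitcast to integers
  std::memcpy(&IB, &B, sizeof(IB));
  uint32_t IR = IA & IB;            // ISD::AND in place of X86ISD::FAND
  float R;
  std::memcpy(&R, &IR, sizeof(R));  // bitcast back to FP
  return R;
}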
43582
43583
43584/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
43585static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
43586 if (N->getOpcode() != ISD::XOR)
43587 return SDValue();
43588
43589 SDValue LHS = N->getOperand(0);
43590 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
43591 return SDValue();
43592
43593 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
43594 X86::CondCode(LHS->getConstantOperandVal(0)));
43595 SDLoc DL(N);
43596 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
43597}
43598
43599static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
43600 TargetLowering::DAGCombinerInfo &DCI,
43601 const X86Subtarget &Subtarget) {
43602 // If this is SSE1 only convert to FXOR to avoid scalarization.
43603 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
43604 N->getValueType(0) == MVT::v4i32) {
43605 return DAG.getBitcast(
43606 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
43607 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
43608 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
43609 }
43610
43611 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
43612 return Cmp;
43613
43614 if (DCI.isBeforeLegalizeOps())
43615 return SDValue();
43616
43617 if (SDValue SetCC = foldXor1SetCC(N, DAG))
43618 return SetCC;
43619
43620 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
43621 return RV;
43622
43623 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
43624 return FPLogic;
43625
43626 return combineFneg(N, DAG, DCI, Subtarget);
43627}
43628
43629static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
43630 TargetLowering::DAGCombinerInfo &DCI,
43631 const X86Subtarget &Subtarget) {
43632 EVT VT = N->getValueType(0);
43633 unsigned NumBits = VT.getSizeInBits();
43634
43635 // TODO - Constant Folding.
43636
43637 // Simplify the inputs.
43638 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43639 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
43640 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
43641 return SDValue(N, 0);
43642
43643 return SDValue();
43644}
43645
43646static bool isNullFPScalarOrVectorConst(SDValue V) {
43647 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
43648}
43649
43650/// If a value is a scalar FP zero or a vector FP zero (potentially including
43651/// undefined elements), return a zero constant that may be used to fold away
43652/// that value. In the case of a vector, the returned constant will not contain
43653/// undefined elements even if the input parameter does. This makes it suitable
43654/// to be used as a replacement operand with operations (eg, bitwise-and) where
43655/// an undef should not propagate.
43656static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
43657 const X86Subtarget &Subtarget) {
43658 if (!isNullFPScalarOrVectorConst(V))
43659 return SDValue();
43660
43661 if (V.getValueType().isVector())
43662 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
43663
43664 return V;
43665}
43666
43667static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
43668 const X86Subtarget &Subtarget) {
43669 SDValue N0 = N->getOperand(0);
43670 SDValue N1 = N->getOperand(1);
43671 EVT VT = N->getValueType(0);
43672 SDLoc DL(N);
43673
43674 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
43675 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
43676 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
43677 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
43678 return SDValue();
43679
43680 auto isAllOnesConstantFP = [](SDValue V) {
43681 if (V.getSimpleValueType().isVector())
43682 return ISD::isBuildVectorAllOnes(V.getNode());
43683 auto *C = dyn_cast<ConstantFPSDNode>(V);
43684 return C && C->getConstantFPValue()->isAllOnesValue();
43685 };
43686
43687 // fand (fxor X, -1), Y --> fandn X, Y
43688 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
43689 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
43690
43691 // fand X, (fxor Y, -1) --> fandn Y, X
43692 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
43693 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
43694
43695 return SDValue();
43696}
43697
43698/// Do target-specific dag combines on X86ISD::FAND nodes.
43699static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
43700 const X86Subtarget &Subtarget) {
43701 // FAND(0.0, x) -> 0.0
43702 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
43703 return V;
43704
43705 // FAND(x, 0.0) -> 0.0
43706 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
43707 return V;
43708
43709 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
43710 return V;
43711
43712 return lowerX86FPLogicOp(N, DAG, Subtarget);
43713}
43714
43715/// Do target-specific dag combines on X86ISD::FANDN nodes.
43716static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
43717 const X86Subtarget &Subtarget) {
43718 // FANDN(0.0, x) -> x
43719 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
43720 return N->getOperand(1);
43721
43722 // FANDN(x, 0.0) -> 0.0
43723 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
43724 return V;
43725
43726 return lowerX86FPLogicOp(N, DAG, Subtarget);
43727}
43728
43729/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
43730static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
43731 TargetLowering::DAGCombinerInfo &DCI,
43732 const X86Subtarget &Subtarget) {
43733 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
43734
43735 // F[X]OR(0.0, x) -> x
43736 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
43737 return N->getOperand(1);
43738
43739 // F[X]OR(x, 0.0) -> x
43740 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
43741 return N->getOperand(0);
43742
43743 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
43744 return NewVal;
43745
43746 return lowerX86FPLogicOp(N, DAG, Subtarget);
43747}
43748
43749/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
43750static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
43751 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
43752
43753 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
43754 if (!DAG.getTarget().Options.NoNaNsFPMath ||
43755 !DAG.getTarget().Options.NoSignedZerosFPMath)
43756 return SDValue();
43757
43758 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
43759 // into FMINC and FMAXC, which are commutative operations.
43760 unsigned NewOp = 0;
43761 switch (N->getOpcode()) {
43762 default: llvm_unreachable("unknown opcode");
43763 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
43764 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
43765 }
43766
43767 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
43768 N->getOperand(0), N->getOperand(1));
43769}
43770
43771static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
43772 const X86Subtarget &Subtarget) {
43773 if (Subtarget.useSoftFloat())
43774 return SDValue();
43775
43776 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43777
43778 EVT VT = N->getValueType(0);
43779 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43780 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43781 (VT.isVector() && TLI.isTypeLegal(VT))))
43782 return SDValue();
43783
43784 SDValue Op0 = N->getOperand(0);
43785 SDValue Op1 = N->getOperand(1);
43786 SDLoc DL(N);
43787 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
43788
43789 // If we don't have to respect NaN inputs, this is a direct translation to x86
43790 // min/max instructions.
43791 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
43792 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
43793
43794 // If one of the operands is known non-NaN, use the native min/max instructions
43795 // with the non-NaN input as the second operand.
43796 if (DAG.isKnownNeverNaN(Op1))
43797 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
43798 if (DAG.isKnownNeverNaN(Op0))
43799 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
43800
43801 // If we have to respect NaN inputs, this takes at least 3 instructions.
43802 // Favor a library call when operating on a scalar and minimizing code size.
43803 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
43804 return SDValue();
43805
43806 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
43807 VT);
43808
43809 // There are 4 possibilities involving NaN inputs, and these are the required
43810 // outputs:
43811 // Op1
43812 // Num NaN
43813 // ----------------
43814 // Num | Max | Op0 |
43815 // Op0 ----------------
43816 // NaN | Op1 | NaN |
43817 // ----------------
43818 //
43819 // The SSE FP max/min instructions were not designed for this case, but rather
43820 // to implement:
43821 // Min = Op1 < Op0 ? Op1 : Op0
43822 // Max = Op1 > Op0 ? Op1 : Op0
43823 //
43824 // So they always return Op0 if either input is a NaN. However, we can still
43825 // use those instructions for fmaxnum by selecting away a NaN input.
43826
43827 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
43828 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
43829 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
43830
43831 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
43832 // are NaN, the NaN value of Op1 is the result.
43833 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
43834}
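
The lowering above can be sanity-checked against the NaN table with a scalar model. This sketch is not taken from the listing; x86Max and loweredFMaxNum are illustrative stand-ins for the X86ISD::FMAX node and the NaN-select sequence, assuming the "second source operand on NaN" semantics described in the comments:

// Illustrative scalar model of the fmaxnum lowering above.
#include <cassert>
#include <cmath>
#include <limits>

static float x86Max(float A, float B) { return A > B ? A : B; } // NaN in either => B

static float loweredFMaxNum(float Op0, float Op1) {
  float MinOrMax = x86Max(Op1, Op0);  // NaN in either input => Op0 passes through
  bool IsOp0Nan = std::isnan(Op0);    // setcc Op0, Op0, setuo
  return IsOp0Nan ? Op1 : MinOrMax;   // select away the NaN input
}

int main() {
  float QNaN = std::numeric_limits<float>::quiet_NaN();
  assert(loweredFMaxNum(1.0f, 2.0f) == 2.0f);     // Num/Num -> Max
  assert(loweredFMaxNum(1.0f, QNaN) == 1.0f);     // Op1 NaN  -> Op0
  assert(loweredFMaxNum(QNaN, 2.0f) == 2.0f);     // Op0 NaN  -> Op1
  assert(std::isnan(loweredFMaxNum(QNaN, QNaN))); // both NaN -> NaN
  return 0;
}
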
43835
43836static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
43837 TargetLowering::DAGCombinerInfo &DCI) {
43838 EVT VT = N->getValueType(0);
43839 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43840
43841 APInt KnownUndef, KnownZero;
43842 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
43843 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
43844 KnownZero, DCI))
43845 return SDValue(N, 0);
43846
43847 // Convert a full vector load into vzload when not all bits are needed.
43848 SDValue In = N->getOperand(0);
43849 MVT InVT = In.getSimpleValueType();
43850 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
43851 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
43852 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
43853 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
43854 // Unless the load is volatile or atomic.
43855 if (LN->isSimple()) {
43856 SDLoc dl(N);
43857 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
43858 MVT MemVT = MVT::getIntegerVT(NumBits);
43859 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
43860 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43861 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
43862 SDValue VZLoad =
43863 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
43864 LN->getPointerInfo(),
43865 LN->getAlignment(),
43866 LN->getMemOperand()->getFlags());
43867 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
43868 DAG.getBitcast(InVT, VZLoad));
43869 DCI.CombineTo(N, Convert);
43870 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
43871 return SDValue(N, 0);
43872 }
43873 }
43874
43875 return SDValue();
43876}
43877
43878static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
43879 TargetLowering::DAGCombinerInfo &DCI) {
43880 // FIXME: Handle strict fp nodes.
43881 EVT VT = N->getValueType(0);
43882
43883 // Convert a full vector load into vzload when not all bits are needed.
43884 SDValue In = N->getOperand(0);
43885 MVT InVT = In.getSimpleValueType();
43886 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
43887 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
43888 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
43889 LoadSDNode *LN = cast<LoadSDNode>(In);
43890 // Unless the load is volatile or atomic.
43891 if (LN->isSimple()) {
43892 SDLoc dl(N);
43893 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
43894 MVT MemVT = MVT::getFloatingPointVT(NumBits);
43895 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
43896 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43897 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
43898 SDValue VZLoad =
43899 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
43900 LN->getPointerInfo(),
43901 LN->getAlignment(),
43902 LN->getMemOperand()->getFlags());
43903 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
43904 DAG.getBitcast(InVT, VZLoad));
43905 DCI.CombineTo(N, Convert);
43906 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
43907 return SDValue(N, 0);
43908 }
43909 }
43910
43911 return SDValue();
43912}
43913
43914/// Do target-specific dag combines on X86ISD::ANDNP nodes.
43915static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
43916 TargetLowering::DAGCombinerInfo &DCI,
43917 const X86Subtarget &Subtarget) {
43918 MVT VT = N->getSimpleValueType(0);
43919
43920 // ANDNP(0, x) -> x
43921 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
43922 return N->getOperand(1);
43923
43924 // ANDNP(x, 0) -> 0
43925 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
43926 return DAG.getConstant(0, SDLoc(N), VT);
43927
43928 // Turn ANDNP back to AND if input is inverted.
43929 if (SDValue Not = IsNOT(N->getOperand(0), DAG))
43930 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not),
43931 N->getOperand(1));
43932
43933 // Attempt to recursively combine a bitmask ANDNP with shuffles.
43934 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
43935 SDValue Op(N, 0);
43936 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43937 return Res;
43938 }
43939
43940 return SDValue();
43941}
43942
43943static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
43944 TargetLowering::DAGCombinerInfo &DCI) {
43945 SDValue N1 = N->getOperand(1);
43946
43947 // BT ignores high bits in the bit index operand.
43948 unsigned BitWidth = N1.getValueSizeInBits();
43949 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
43950 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
43951 if (N->getOpcode() != ISD::DELETED_NODE)
43952 DCI.AddToWorklist(N);
43953 return SDValue(N, 0);
43954 }
43955
43956 return SDValue();
43957}
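
The BT combine above works because the hardware only consults the low Log2_32(BitWidth) bits of the bit-index operand, so the higher bits can be simplified away. A small illustrative model, not from the listing (btModel is a hypothetical helper), assuming register-form BT semantics where the bit offset is taken modulo the operand width:

// Illustrative check: high bits of the BT bit index do not affect the result.
#include <cassert>
#include <cstdint>

static bool btModel(uint32_t Val, uint32_t Index) {
  return (Val >> (Index & 31)) & 1; // 32-bit register BT uses Index mod 32
}

int main() {
  uint32_t V = 0x00000010u;                    // bit 4 set
  assert(btModel(V, 4) == btModel(V, 4 + 32)); // bits above log2(32) ignored
  assert(btModel(V, 4) && !btModel(V, 5));
  return 0;
}
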
43958
43959static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
43960 TargetLowering::DAGCombinerInfo &DCI) {
43961 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
43962 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
43963
43964 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
43965 APInt KnownUndef, KnownZero;
43966 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43967 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
43968 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43969 DCI)) {
43970 if (N->getOpcode() != ISD::DELETED_NODE)
43971 DCI.AddToWorklist(N);
43972 return SDValue(N, 0);
43973 }
43974
43975 // FIXME: Shrink vector loads.
43976 if (IsStrict)
43977 return SDValue();
43978
43979 // Convert a full vector load into vzload when not all bits are needed.
43980 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
43981 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
43982 // Unless the load is volatile or atomic.
43983 if (LN->isSimple()) {
43984 SDLoc dl(N);
43985 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
43986 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
43987 SDValue VZLoad =
43988 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64,
43989 LN->getPointerInfo(),
43990 LN->getAlignment(),
43991 LN->getMemOperand()->getFlags());
43992 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
43993 DAG.getBitcast(MVT::v8i16, VZLoad));
43994 DCI.CombineTo(N, Convert);
43995 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
43996 return SDValue(N, 0);
43997 }
43998 }
43999 }
44000
44001 return SDValue();
44002}
44003
44004// Try to combine sext_in_reg of a cmov of constants by extending the constants.
44005static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
44006 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
44007
44008 EVT DstVT = N->getValueType(0);
44009
44010 SDValue N0 = N->getOperand(0);
44011 SDValue N1 = N->getOperand(1);
44012 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
44013
44014 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
44015 return SDValue();
44016
44017 // Look through single use any_extends / truncs.
44018 SDValue IntermediateBitwidthOp;
44019 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
44020 N0.hasOneUse()) {
44021 IntermediateBitwidthOp = N0;
44022 N0 = N0.getOperand(0);
44023 }
44024
44025 // See if we have a single use cmov.
44026 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
44027 return SDValue();
44028
44029 SDValue CMovOp0 = N0.getOperand(0);
44030 SDValue CMovOp1 = N0.getOperand(1);
44031
44032 // Make sure both operands are constants.
44033 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
44034 !isa<ConstantSDNode>(CMovOp1.getNode()))
44035 return SDValue();
44036
44037 SDLoc DL(N);
44038
44039 // If we looked through an any_extend/trunc above, apply the same operation to the constants.
44040 if (IntermediateBitwidthOp) {
44041 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
44042 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
44043 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
44044 }
44045
44046 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
44047 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
44048
44049 EVT CMovVT = DstVT;
44050 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
44051 if (DstVT == MVT::i16) {
44052 CMovVT = MVT::i32;
44053 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
44054 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
44055 }
44056
44057 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
44058 N0.getOperand(2), N0.getOperand(3));
44059
44060 if (CMovVT != DstVT)
44061 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
44062
44063 return CMov;
44064}
44065
44066static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
44067 const X86Subtarget &Subtarget) {
44068 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
44069
44070 if (SDValue V = combineSextInRegCmov(N, DAG))
44071 return V;
44072
44073 EVT VT = N->getValueType(0);
44074 SDValue N0 = N->getOperand(0);
44075 SDValue N1 = N->getOperand(1);
44076 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
44077 SDLoc dl(N);
44078
44079 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
44080 // since there is no sign-extended shift-right operation on a vector with
44081 // 64-bit elements.
44082 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
44083 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
44084 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
44085 N0.getOpcode() == ISD::SIGN_EXTEND)) {
44086 SDValue N00 = N0.getOperand(0);
44087
44088 // EXTLOAD has a better solution on AVX2: it may be replaced
44089 // with an X86ISD::VSEXT node.
44090 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
44091 if (!ISD::isNormalLoad(N00.getNode()))
44092 return SDValue();
44093
44094 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
44095 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
44096 N00, N1);
44097 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
44098 }
44099 }
44100 return SDValue();
44101}
44102
44103/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
44104/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
44105/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
44106/// opportunities to combine math ops, use an LEA, or use a complex addressing
44107/// mode. This can eliminate extend, add, and shift instructions.
44108static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
44109 const X86Subtarget &Subtarget) {
44110 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
44111 Ext->getOpcode() != ISD::ZERO_EXTEND)
44112 return SDValue();
44113
44114 // TODO: This should be valid for other integer types.
44115 EVT VT = Ext->getValueType(0);
44116 if (VT != MVT::i64)
44117 return SDValue();
44118
44119 SDValue Add = Ext->getOperand(0);
44120 if (Add.getOpcode() != ISD::ADD)
44121 return SDValue();
44122
44123 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
44124 bool NSW = Add->getFlags().hasNoSignedWrap();
44125 bool NUW = Add->getFlags().hasNoUnsignedWrap();
44126
44127 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
44128 // into the 'zext'.
44129 if ((Sext && !NSW) || (!Sext && !NUW))
44130 return SDValue();
44131
44132 // Having a constant operand to the 'add' ensures that we are not increasing
44133 // the instruction count because the constant is extended for free below.
44134 // A constant operand can also become the displacement field of an LEA.
44135 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
44136 if (!AddOp1)
44137 return SDValue();
44138
44139 // Don't make the 'add' bigger if there's no hope of combining it with some
44140 // other 'add' or 'shl' instruction.
44141 // TODO: It may be profitable to generate simpler LEA instructions in place
44142 // of single 'add' instructions, but the cost model for selecting an LEA
44143 // currently has a high threshold.
44144 bool HasLEAPotential = false;
44145 for (auto *User : Ext->uses()) {
44146 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
44147 HasLEAPotential = true;
44148 break;
44149 }
44150 }
44151 if (!HasLEAPotential)
44152 return SDValue();
44153
44154 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
44155 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
44156 SDValue AddOp0 = Add.getOperand(0);
44157 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
44158 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
44159
44160 // The wider add is guaranteed to not wrap because both operands are
44161 // extended (sign for nsw, zero for nuw) before the add.
44162 SDNodeFlags Flags;
44163 Flags.setNoSignedWrap(NSW);
44164 Flags.setNoUnsignedWrap(NUW);
44165 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
44166}
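
The nsw/nuw reasoning above can be seen on concrete values: if the narrow add cannot wrap, extending before or after the add gives the same wide result, which is what lets the combine pull the extend ahead of the add. An illustrative check, not part of the listing:

// Illustrative check of sext(add nsw(x, C)) == add(sext(x), C_sext) and the zext/nuw analogue.
#include <cassert>
#include <cstdint>

int main() {
  int32_t X = 100000;
  int32_t C = 42;                              // add nsw: no signed wrap for X + C
  int64_t SextOfAdd = (int64_t)(X + C);        // sext(add nsw(x, C))
  int64_t AddOfSext = (int64_t)X + (int64_t)C; // add(sext(x), C_sext)
  assert(SextOfAdd == AddOfSext);

  uint32_t UX = 100000u, UC = 42u;             // add nuw: no unsigned wrap
  uint64_t ZextOfAdd = (uint64_t)(UX + UC);
  uint64_t AddOfZext = (uint64_t)UX + (uint64_t)UC;
  assert(ZextOfAdd == AddOfZext);
  return 0;
}
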
44167
44168 // If we encounter an {ANY,SIGN,ZERO}_EXTEND applied to a CMOV with constant
44169 // operands, and the result of the CMOV is not used anywhere else, promote the
44170 // CMOV itself instead of promoting its result. This can be beneficial because:
44171 // 1) X86TargetLowering::EmitLoweredSelect can later merge two (or more)
44172 // pseudo-CMOVs only when they appear back-to-back, and getting rid of the
44173 // result-extension code after the CMOV helps with that.
44174 // 2) Promotion of constant CMOV arguments is free, hence the
44175 // {ANY,SIGN,ZERO}_EXTEND will simply be deleted.
44176 // 3) A 16-bit CMOV encoding is 4 bytes and a 32-bit CMOV is 3 bytes, so this
44177 // promotion is also good in terms of code size.
44178 // (A 64-bit CMOV is 4 bytes, which is why we don't do the 32-bit => 64-bit
44179 // promotion.)
44180static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
44181 SDValue CMovN = Extend->getOperand(0);
44182 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
44183 return SDValue();
44184
44185 EVT TargetVT = Extend->getValueType(0);
44186 unsigned ExtendOpcode = Extend->getOpcode();
44187 SDLoc DL(Extend);
44188
44189 EVT VT = CMovN.getValueType();
44190 SDValue CMovOp0 = CMovN.getOperand(0);
44191 SDValue CMovOp1 = CMovN.getOperand(1);
44192
44193 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
44194 !isa<ConstantSDNode>(CMovOp1.getNode()))
44195 return SDValue();
44196
44197 // Only extend to i32 or i64.
44198 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
44199 return SDValue();
44200
44201 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
44202 // are free.
44203 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
44204 return SDValue();
44205
44206 // If this is a zero extend to i64, we should only extend to i32 and use a free
44207 // zero extend to finish.
44208 EVT ExtendVT = TargetVT;
44209 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
44210 ExtendVT = MVT::i32;
44211
44212 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
44213 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
44214
44215 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
44216 CMovN.getOperand(2), CMovN.getOperand(3));
44217
44218 // Finish extending if needed.
44219 if (ExtendVT != TargetVT)
44220 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
44221
44222 return Res;
44223}
44224
44225// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44226// This is more or less the reverse of combineBitcastvxi1.
44227static SDValue
44228combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
44229 TargetLowering::DAGCombinerInfo &DCI,
44230 const X86Subtarget &Subtarget) {
44231 unsigned Opcode = N->getOpcode();
44232 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44233 Opcode != ISD::ANY_EXTEND)
44234 return SDValue();
44235 if (!DCI.isBeforeLegalizeOps())
44236 return SDValue();
44237 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44238 return SDValue();
44239
44240 SDValue N0 = N->getOperand(0);
44241 EVT VT = N->getValueType(0);
44242 EVT SVT = VT.getScalarType();
44243 EVT InSVT = N0.getValueType().getScalarType();
44244 unsigned EltSizeInBits = SVT.getSizeInBits();
44245
44246 // The input must be a bool vector (bitcast from a scalar integer) being
44247 // extended to legal integer vector types.
44248 if (!VT.isVector())
44249 return SDValue();
44250 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44251 return SDValue();
44252 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44253 return SDValue();
44254
44255 SDValue N00 = N0.getOperand(0);
44256 EVT SclVT = N0.getOperand(0).getValueType();
44257 if (!SclVT.isScalarInteger())
44258 return SDValue();
44259
44260 SDLoc DL(N);
44261 SDValue Vec;
44262 SmallVector<int, 32> ShuffleMask;
44263 unsigned NumElts = VT.getVectorNumElements();
44264 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44265
44266 // Broadcast the scalar integer to the vector elements.
44267 if (NumElts > EltSizeInBits) {
44268 // If the scalar integer is greater than the vector element size, then we
44269 // must split it down into sub-sections for broadcasting. For example:
44270 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44271 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44272 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44273 unsigned Scale = NumElts / EltSizeInBits;
44274 EVT BroadcastVT =
44275 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44276 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44277 Vec = DAG.getBitcast(VT, Vec);
44278
44279 for (unsigned i = 0; i != Scale; ++i)
44280 ShuffleMask.append(EltSizeInBits, i);
44281 } else {
44282 // For a smaller scalar integer, we can simply any-extend it to the vector
44283 // element size (we don't care about the upper bits) and broadcast it to all
44284 // elements.
44285 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44286 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44287 ShuffleMask.append(NumElts, 0);
44288 }
44289 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44290
44291 // Now, mask the relevant bit in each element.
44292 SmallVector<SDValue, 32> Bits;
44293 for (unsigned i = 0; i != NumElts; ++i) {
44294 int BitIdx = (i % EltSizeInBits);
44295 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44296 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44297 }
44298 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44299 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44300
44301 // Compare against the bitmask and extend the result.
44302 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44303 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44304 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44305
44306 // For SEXT, this is now done; otherwise shift the result down for
44307 // zero-extension.
44308 if (Opcode == ISD::SIGN_EXTEND)
44309 return Vec;
44310 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44311 DAG.getConstant(EltSizeInBits - 1, DL, VT));
44312}
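
The broadcast/mask/compare sequence above has a simple scalar analogue: each vector lane keeps one bit of the original scalar, compares it against that bit's mask, and then sign- or zero-extends the i1 result. The sketch below is illustrative only (it models one i8 scalar feeding eight i8 lanes) and is not part of the listing:

// Illustrative scalar model of extending a v8i1 (bitcast from i8) to v8i8.
#include <cassert>
#include <cstdint>

int main() {
  uint8_t Scl = 0b10110010;                 // i8 bitcast to v8i1
  for (unsigned I = 0; I != 8; ++I) {
    uint8_t Bit = uint8_t(1u << I);         // per-element bit mask
    uint8_t Masked = Scl & Bit;             // and(broadcast(Scl), BitMask)
    int8_t Sext = (Masked == Bit) ? -1 : 0; // setcc eq + sign extend
    uint8_t Zext = uint8_t(Sext) >> 7;      // srl by EltSizeInBits-1 for zext
    assert(Sext == ((Scl >> I) & 1 ? -1 : 0));
    assert(Zext == ((Scl >> I) & 1));
  }
  return 0;
}
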
44313
44314 // Attempt to combine a (sext/zext (setcc)) to a setcc with an xmm/ymm/zmm
44315// result type.
44316static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
44317 const X86Subtarget &Subtarget) {
44318 SDValue N0 = N->getOperand(0);
44319 EVT VT = N->getValueType(0);
44320 SDLoc dl(N);
44321
44322 // Only do this combine with AVX512 for vector extends.
44323 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
44324 return SDValue();
44325
44326 // Only combine legal element types.
44327 EVT SVT = VT.getVectorElementType();
44328 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
44329 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
44330 return SDValue();
44331
44332 // We can only do this if the vector size is 256 bits or less.
44333 unsigned Size = VT.getSizeInBits();
44334 if (Size > 256)
44335 return SDValue();
44336
44337 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
44338 // those are the only integer compares we have.
44339 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
44340 if (ISD::isUnsignedIntSetCC(CC))
44341 return SDValue();
44342
44343 // Only do this combine if the extension will be fully consumed by the setcc.
44344 EVT N00VT = N0.getOperand(0).getValueType();
44345 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
44346 if (Size != MatchingVecType.getSizeInBits())
44347 return SDValue();
44348
44349 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
44350
44351 if (N->getOpcode() == ISD::ZERO_EXTEND)
44352 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
44353
44354 return Res;
44355}
44356
44357static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
44358 TargetLowering::DAGCombinerInfo &DCI,
44359 const X86Subtarget &Subtarget) {
44360 SDValue N0 = N->getOperand(0);
44361 EVT VT = N->getValueType(0);
44362 EVT InVT = N0.getValueType();
44363 SDLoc DL(N);
44364
44365 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
44366 if (!DCI.isBeforeLegalizeOps() &&
44367 N0.getOpcode() == X86ISD::SETCC_CARRY) {
44368 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
44369 N0->getOperand(1));
44370 bool ReplaceOtherUses = !N0.hasOneUse();
44371 DCI.CombineTo(N, Setcc);
44372 // Replace other uses with a truncate of the widened setcc_carry.
44373 if (ReplaceOtherUses) {
44374 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
44375 N0.getValueType(), Setcc);
44376 DCI.CombineTo(N0.getNode(), Trunc);
44377 }
44378
44379 return SDValue(N, 0);
44380 }
44381
44382 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
44383 return NewCMov;
44384
44385 if (!DCI.isBeforeLegalizeOps())
44386 return SDValue();
44387
44388 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
44389 return V;
44390
44391 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
44392 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
44393 // Inverting and sign-extending a boolean is the same as zero-extending it and
44394 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
44395 // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
44396 // sext (xor Bool, -1) --> sub (zext Bool), 1
44397 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
44398 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
44399 }
44400
44401 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
44402 return V;
44403
44404 if (VT.isVector())
44405 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
44406 return R;
44407
44408 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
44409 return NewAdd;
44410
44411 return SDValue();
44412}
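
The boolean fold used in combineSext can be verified directly for both boolean values. An illustrative check, not part of the listing:

// Illustrative check: sext (xor Bool, -1) == sub (zext Bool), 1 for i1 inputs.
#include <cassert>
#include <cstdint>

int main() {
  for (int Bool = 0; Bool <= 1; ++Bool) {
    int32_t SextOfNot = (Bool ^ 1) ? -1 : 0; // sext (xor Bool, -1)
    int32_t ZextMinus1 = (int32_t)Bool - 1;  // sub (zext Bool), 1
    assert(SextOfNot == ZextMinus1);         // 0 -> -1, 1 -> 0
  }
  return 0;
}
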
44413
44414static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
44415 TargetLowering::DAGCombinerInfo &DCI,
44416 const X86Subtarget &Subtarget) {
44417 SDLoc dl(N);
44418 EVT VT = N->getValueType(0);
44419 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
44420
44421 // Let legalize expand this if it isn't a legal type yet.
44422 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44423 if (!TLI.isTypeLegal(VT))
44424 return SDValue();
44425
44426 EVT ScalarVT = VT.getScalarType();
44427 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
44428 return SDValue();
44429
44430 SDValue A = N->getOperand(IsStrict ? 1 : 0);
44431 SDValue B = N->getOperand(IsStrict ? 2 : 1);
44432 SDValue C = N->getOperand(IsStrict ? 3 : 2);
44433
44434 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
44435 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
44436 bool LegalOperations = !DCI.isBeforeLegalizeOps();
44437 if (TLI.getNegatibleCost(V, DAG, LegalOperations, CodeSize) ==
44438 TargetLowering::NegatibleCost::Cheaper) {
44439 V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
44440 return true;
44441 }
44442 // Look through extract_vector_elts. If it comes from an FNEG, create a
44443 // new extract from the FNEG input.
44444 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44445 isNullConstant(V.getOperand(1))) {
44446 SDValue Vec = V.getOperand(0);
44447 if (TLI.getNegatibleCost(Vec, DAG, LegalOperations, CodeSize) ==
44448 TargetLowering::NegatibleCost::Cheaper) {
44449 SDValue NegVal =
44450 TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
44451 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
44452 NegVal, V.getOperand(1));
44453 return true;
44454 }
44455 }
44456
44457 return false;
44458 };
44459
44460 // Do not convert the passthru input of scalar intrinsics.
44461 // FIXME: We could allow negations of the lower element only.
44462 bool NegA = invertIfNegative(A);
44463 bool NegB = invertIfNegative(B);
44464 bool NegC = invertIfNegative(C);
44465
44466 if (!NegA && !NegB && !NegC)
44467 return SDValue();
44468
44469 unsigned NewOpcode =
44470 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
44471
44472 if (IsStrict) {
44473 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
44474 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
44475 {N->getOperand(0), A, B, C});
44476 } else {
44477 if (N->getNumOperands() == 4)
44478 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
44479 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
44480 }
44481}
44482
44483// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
44484// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
44485static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
44486 TargetLowering::DAGCombinerInfo &DCI) {
44487 SDLoc dl(N);
44488 EVT VT = N->getValueType(0);
44489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44490 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
44491 bool LegalOperations = !DCI.isBeforeLegalizeOps();
44492
44493 SDValue N2 = N->getOperand(2);
44494 if (TLI.getNegatibleCost(N2, DAG, LegalOperations, CodeSize) !=
44495 TargetLowering::NegatibleCost::Cheaper)
44496 return SDValue();
44497
44498 SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
44499 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
44500
44501 if (N->getNumOperands() == 4)
44502 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
44503 NegN2, N->getOperand(3));
44504 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
44505 NegN2);
44506}
44507
44508static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
44509 TargetLowering::DAGCombinerInfo &DCI,
44510 const X86Subtarget &Subtarget) {
44511 SDLoc dl(N);
44512 SDValue N0 = N->getOperand(0);
44513 EVT VT = N->getValueType(0);
44514
44515 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
44516 // FIXME: Is this needed? We don't seem to have any tests for it.
44517 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
44518 N0.getOpcode() == X86ISD::SETCC_CARRY) {
44519 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
44520 N0->getOperand(1));
44521 bool ReplaceOtherUses = !N0.hasOneUse();
44522 DCI.CombineTo(N, Setcc);
44523 // Replace other uses with a truncate of the widened setcc_carry.
44524 if (ReplaceOtherUses) {
44525 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
44526 N0.getValueType(), Setcc);
44527 DCI.CombineTo(N0.getNode(), Trunc);
44528 }
44529
44530 return SDValue(N, 0);
44531 }
44532
44533 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
44534 return NewCMov;
44535
44536 if (DCI.isBeforeLegalizeOps())
44537 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
44538 return V;
44539
44540 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
44541 return V;
44542
44543 if (VT.isVector())
44544 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
44545 return R;
44546
44547 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
44548 return NewAdd;
44549
44550 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
44551 return R;
44552
44553 // TODO: Combine with any target/faux shuffle.
44554 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
44555 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
44556 SDValue N00 = N0.getOperand(0);
44557 SDValue N01 = N0.getOperand(1);
44558 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
44559 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
44560 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
44561 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
44562 return concatSubVectors(N00, N01, DAG, dl);
44563 }
44564 }
44565
44566 return SDValue();
44567}
44568
44569/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
44570/// recognizable memcmp expansion.
44571static bool isOrXorXorTree(SDValue X, bool Root = true) {
44572 if (X.getOpcode() == ISD::OR)
44573 return isOrXorXorTree(X.getOperand(0), false) &&
44574 isOrXorXorTree(X.getOperand(1), false);
44575 if (Root)
44576 return false;
44577 return X.getOpcode() == ISD::XOR;
44578}
44579
44580/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
44581/// expansion.
44582template<typename F>
44583static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
44584 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
44585 SDValue Op0 = X.getOperand(0);
44586 SDValue Op1 = X.getOperand(1);
44587 if (X.getOpcode() == ISD::OR) {
44588 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
44589 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
44590 if (VecVT != CmpVT)
44591 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
44592 if (HasPT)
44593 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
44594 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
44595 } else if (X.getOpcode() == ISD::XOR) {
44596 SDValue A = SToV(Op0);
44597 SDValue B = SToV(Op1);
44598 if (VecVT != CmpVT)
44599 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
44600 if (HasPT)
44601 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
44602 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
44603 }
44604 llvm_unreachable("Impossible");
44605}
44606
44607/// Try to map a 128-bit or larger integer comparison to vector instructions
44608/// before type legalization splits it up into chunks.
44609static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
44610 const X86Subtarget &Subtarget) {
44611 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
44612 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
44613
44614 // We're looking for an oversized integer equality comparison.
44615 SDValue X = SetCC->getOperand(0);
44616 SDValue Y = SetCC->getOperand(1);
44617 EVT OpVT = X.getValueType();
44618 unsigned OpSize = OpVT.getSizeInBits();
44619 if (!OpVT.isScalarInteger() || OpSize < 128)
44620 return SDValue();
44621
44622 // Ignore a comparison with zero because that gets special treatment in
44623 // EmitTest(). But make an exception for the special case of a pair of
44624 // logically-combined vector-sized operands compared to zero. This pattern may
44625 // be generated by the memcmp expansion pass with oversized integer compares
44626 // (see PR33325).
44627 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
44628 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
44629 return SDValue();
44630
44631 // Don't perform this combine if constructing the vector will be expensive.
44632 auto IsVectorBitCastCheap = [](SDValue X) {
44633 X = peekThroughBitcasts(X);
44634 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
44635 X.getOpcode() == ISD::LOAD;
44636 };
44637 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
44638 !IsOrXorXorTreeCCZero)
44639 return SDValue();
44640
44641 EVT VT = SetCC->getValueType(0);
44642 SDLoc DL(SetCC);
44643 bool HasAVX = Subtarget.hasAVX();
44644
44645 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
44646 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
44647 // Otherwise use PCMPEQ (plus AND) and mask testing.
44648 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
44649 (OpSize == 256 && HasAVX) ||
44650 (OpSize == 512 && Subtarget.useAVX512Regs())) {
44651 bool HasPT = Subtarget.hasSSE41();
44652
44653 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
44654 // vector registers are essentially free. (Technically, widening registers
44655 // prevents load folding, but the tradeoff is worth it.)
44656 bool PreferKOT = Subtarget.preferMaskRegisters();
44657 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
44658
44659 EVT VecVT = MVT::v16i8;
44660 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
44661 if (OpSize == 256) {
44662 VecVT = MVT::v32i8;
44663 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
44664 }
44665 EVT CastVT = VecVT;
44666 bool NeedsAVX512FCast = false;
44667 if (OpSize == 512 || NeedZExt) {
44668 if (Subtarget.hasBWI()) {
44669 VecVT = MVT::v64i8;
44670 CmpVT = MVT::v64i1;
44671 if (OpSize == 512)
44672 CastVT = VecVT;
44673 } else {
44674 VecVT = MVT::v16i32;
44675 CmpVT = MVT::v16i1;
44676 CastVT = OpSize == 512 ? VecVT :
44677 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
44678 NeedsAVX512FCast = true;
44679 }
44680 }
44681
44682 auto ScalarToVector = [&](SDValue X) -> SDValue {
44683 bool TmpZext = false;
44684 EVT TmpCastVT = CastVT;
44685 if (X.getOpcode() == ISD::ZERO_EXTEND) {
44686 SDValue OrigX = X.getOperand(0);
44687 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
44688 if (OrigSize < OpSize) {
44689 if (OrigSize == 128) {
44690 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
44691 X = OrigX;
44692 TmpZext = true;
44693 } else if (OrigSize == 256) {
44694 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
44695 X = OrigX;
44696 TmpZext = true;
44697 }
44698 }
44699 }
44700 X = DAG.getBitcast(TmpCastVT, X);
44701 if (!NeedZExt && !TmpZext)
44702 return X;
44703 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
44704 DAG.getConstant(0, DL, VecVT), X,
44705 DAG.getVectorIdxConstant(0, DL));
44706 };
44707
44708 SDValue Cmp;
44709 if (IsOrXorXorTreeCCZero) {
44710 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
44711 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
44712 // Use 2 vector equality compares and 'and' the results before doing a
44713 // MOVMSK.
44714 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
44715 } else {
44716 SDValue VecX = ScalarToVector(X);
44717 SDValue VecY = ScalarToVector(Y);
44718 if (VecVT != CmpVT) {
44719 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
44720 } else if (HasPT) {
44721 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
44722 } else {
44723 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
44724 }
44725 }
44726 // AVX512 should emit a setcc that will lower to kortest.
44727 if (VecVT != CmpVT) {
44728 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
44729 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
44730 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
44731 DAG.getConstant(0, DL, KRegVT), CC);
44732 }
44733 if (HasPT) {
44734 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
44735 Cmp);
44736 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
44737 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
44738 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
44739 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
44740 }
44741 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
44742 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
44743 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
44744 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
44745 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
44746 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
44747 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
44748 MVT::i32);
44749 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
44750 }
44751
44752 return SDValue();
44753}
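
On SSE2 targets the combine above turns a 128-bit equality compare into the pcmpeqb + pmovmskb idiom from the comments. The sketch below is not from the listing; equal16 is an illustrative helper built on SSE2 intrinsics and assumes 16-byte inputs:

// Illustrative 128-bit equality via pcmpeqb/pmovmskb: all-bytes-equal <=> mask == 0xFFFF.
#include <cassert>
#include <cstring>
#include <emmintrin.h>

static bool equal16(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);    // pcmpeqb
  return _mm_movemask_epi8(Eq) == 0xFFFF; // pmovmskb; every byte matched
}

int main() {
  char X[16] = "fifteen chars..";
  char Y[16];
  std::memcpy(Y, X, 16);
  assert(equal16(X, Y));
  Y[7] ^= 1;                              // flip one bit
  assert(!equal16(X, Y));
  return 0;
}
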
44754
44755static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
44756 const X86Subtarget &Subtarget) {
44757 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
44758 const SDValue LHS = N->getOperand(0);
44759 const SDValue RHS = N->getOperand(1);
44760 EVT VT = N->getValueType(0);
44761 EVT OpVT = LHS.getValueType();
44762 SDLoc DL(N);
44763
44764 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
44765 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
44766 return V;
44767 }
44768
44769 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44770 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
44771 // Using temporaries to avoid messing up operand ordering for later
44772 // transformations if this doesn't work.
44773 SDValue Op0 = LHS;
44774 SDValue Op1 = RHS;
44775 ISD::CondCode TmpCC = CC;
44776 // Put build_vector on the right.
44777 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
44778 std::swap(Op0, Op1);
44779 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
44780 }
44781
44782 bool IsSEXT0 =
44783 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
44784 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
44785 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
44786
44787 if (IsSEXT0 && IsVZero1) {
44788 assert(VT == Op0.getOperand(0).getValueType() &&
44789 "Unexpected operand type");
44790 if (TmpCC == ISD::SETGT)
44791 return DAG.getConstant(0, DL, VT);
44792 if (TmpCC == ISD::SETLE)
44793 return DAG.getConstant(1, DL, VT);
44794 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
44795 return DAG.getNOT(DL, Op0.getOperand(0), VT);
44796
44797 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
44798 "Unexpected condition code!");
44799 return Op0.getOperand(0);
44800 }
44801 }
44802
44803 // If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just
44804 // pre-promote its result type since vXi1 vectors don't get promoted
44805 // during type legalization.
44806 // NOTE: The element count check is to ignore operand types that need to
44807 // go through type promotion to a 128-bit vector.
44808 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
44809 VT.getVectorElementType() == MVT::i1 &&
44810 (OpVT.getVectorElementType() == MVT::i8 ||
44811 OpVT.getVectorElementType() == MVT::i16)) {
44812 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
44813 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
44814 }
44815
44816 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
44817 // to avoid scalarization via legalization because v4i32 is not a legal type.
44818 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
44819 LHS.getValueType() == MVT::v4f32)
44820 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
44821
44822 return SDValue();
44823}
44824
44825static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
44826 TargetLowering::DAGCombinerInfo &DCI,
44827 const X86Subtarget &Subtarget) {
44828 SDValue Src = N->getOperand(0);
44829 MVT SrcVT = Src.getSimpleValueType();
44830 MVT VT = N->getSimpleValueType(0);
44831 unsigned NumBits = VT.getScalarSizeInBits();
44832 unsigned NumElts = SrcVT.getVectorNumElements();
44833
44834 // Perform constant folding.
44835 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
44836 assert(VT == MVT::i32 && "Unexpected result type");
44837 APInt Imm(32, 0);
44838 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
44839 if (!Src.getOperand(Idx).isUndef() &&
44840 Src.getConstantOperandAPInt(Idx).isNegative())
44841 Imm.setBit(Idx);
44842 }
44843 return DAG.getConstant(Imm, SDLoc(N), VT);
44844 }
44845
44846 // Look through int->fp bitcasts that don't change the element width.
44847 unsigned EltWidth = SrcVT.getScalarSizeInBits();
44848 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
44849 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
44850 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
44851
44852 // Fold movmsk(not(x)) -> not(movmsk) to improve folding of movmsk results
44853 // with scalar comparisons.
44854 if (SDValue NotSrc = IsNOT(Src, DAG)) {
44855 SDLoc DL(N);
44856 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
44857 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
44858 return DAG.getNode(ISD::XOR, DL, VT,
44859 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
44860 DAG.getConstant(NotMask, DL, VT));
44861 }
44862
44863 // Simplify the inputs.
44864 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44865 APInt DemandedMask(APInt::getAllOnesValue(NumBits));
44866 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
44867 return SDValue(N, 0);
44868
44869 return SDValue();
44870}
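
The movmsk(not(x)) fold above amounts to inverting the MOVMSK result and keeping only the low NumElts bits. An illustrative scalar model of a v4i32 MOVMSK, not part of the listing:

// Illustrative check: movmsk(not(x)) == xor(movmsk(x), low-NumElts-bits mask).
#include <cassert>
#include <cstdint>

int main() {
  // Model a v4i32 sign-bit mask: lane I contributes bit I if its sign bit is set.
  uint32_t Lanes[4] = {0x80000000u, 0x00000001u, 0xFFFFFFFFu, 0x7FFFFFFFu};
  unsigned Msk = 0, MskOfNot = 0;
  for (unsigned I = 0; I != 4; ++I) {
    Msk |= (Lanes[I] >> 31) << I;         // movmsk(x)
    MskOfNot |= ((~Lanes[I]) >> 31) << I; // movmsk(not(x))
  }
  unsigned NotMask = 0xF;                 // APInt::getLowBitsSet(NumBits, NumElts)
  assert(MskOfNot == ((~Msk) & NotMask)); // same as xor(movmsk(x), NotMask)
  return 0;
}
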
44871
44872static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
44873 TargetLowering::DAGCombinerInfo &DCI) {
44874 // With vector masks we only demand the upper bit of the mask.
44875 SDValue Mask = cast<X86MaskedGatherScatterSDNode>(N)->getMask();
44876 if (Mask.getScalarValueSizeInBits() != 1) {
44877 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44878 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
44879 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
44880 if (N->getOpcode() != ISD::DELETED_NODE)
44881 DCI.AddToWorklist(N);
44882 return SDValue(N, 0);
44883 }
44884 }
44885
44886 return SDValue();
44887}
44888
44889static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
44890 SDValue Index, SDValue Base, SDValue Scale,
44891 SelectionDAG &DAG) {
44892 SDLoc DL(GorS);
44893
44894 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
44895 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
44896 Gather->getMask(), Base, Index, Scale } ;
44897 return DAG.getMaskedGather(Gather->getVTList(),
44898 Gather->getMemoryVT(), DL, Ops,
44899 Gather->getMemOperand(),
44900 Gather->getIndexType());
44901 }
44902 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
44903 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
44904 Scatter->getMask(), Base, Index, Scale };
44905 return DAG.getMaskedScatter(Scatter->getVTList(),
44906 Scatter->getMemoryVT(), DL,
44907 Ops, Scatter->getMemOperand(),
44908 Scatter->getIndexType());
44909}
44910
44911static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
44912 TargetLowering::DAGCombinerInfo &DCI) {
44913 SDLoc DL(N);
44914 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
44915 SDValue Index = GorS->getIndex();
44916 SDValue Base = GorS->getBasePtr();
44917 SDValue Scale = GorS->getScale();
44918
44919 if (DCI.isBeforeLegalize()) {
44920 unsigned IndexWidth = Index.getScalarValueSizeInBits();
44921
44922 // Shrink constant indices if they are larger than 32 bits.
44923 // Only do this before legalize types since v2i64 could become v2i32.
44924 // FIXME: We could check that the type is legal if we're after legalize
44925 // types, but then we would need to construct test cases where that happens.
44926 // FIXME: We could support more than just constant vectors, but we need to be
44927 // careful with costing. A truncate that can be optimized out would be fine.
44928 // Otherwise we might only want to create a truncate if it avoids a split.
44929 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
44930 if (BV->isConstant() && IndexWidth > 32 &&
44931 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
44932 unsigned NumElts = Index.getValueType().getVectorNumElements();
44933 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
44934 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
44935 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
44936 }
44937 }
44938
44939 // Shrink sign/zero extends that go from 32 bits or smaller to larger than 32
44940 // bits if there are sufficient sign bits. Only do this before legalize types to
44941 // avoid creating illegal types in truncate.
44942 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
44943 Index.getOpcode() == ISD::ZERO_EXTEND) &&
44944 IndexWidth > 32 &&
44945 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
44946 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
44947 unsigned NumElts = Index.getValueType().getVectorNumElements();
44948 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
44949 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
44950 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
44951 }
44952 }
44953
44954 if (DCI.isBeforeLegalizeOps()) {
44955 unsigned IndexWidth = Index.getScalarValueSizeInBits();
44956
44957 // Make sure the index is either i32 or i64.
44958 if (IndexWidth != 32 && IndexWidth != 64) {
44959 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
44960 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
44961 Index.getValueType().getVectorNumElements());
44962 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
44963 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
44964 }
44965 }
44966
44967 // With vector masks we only demand the upper bit of the mask.
44968 SDValue Mask = GorS->getMask();
44969 if (Mask.getScalarValueSizeInBits() != 1) {
44970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44971 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
44972 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
44973 if (N->getOpcode() != ISD::DELETED_NODE)
44974 DCI.AddToWorklist(N);
44975 return SDValue(N, 0);
44976 }
44977 }
44978
44979 return SDValue();
44980}
44981
44982// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
44983static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
44984 const X86Subtarget &Subtarget) {
44985 SDLoc DL(N);
44986 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
44987 SDValue EFLAGS = N->getOperand(1);
44988
44989 // Try to simplify the EFLAGS and condition code operands.
44990 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
44991 return getSETCC(CC, Flags, DL, DAG);
44992
44993 return SDValue();
44994}
44995
44996/// Optimize branch condition evaluation.
44997static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
44998 const X86Subtarget &Subtarget) {
44999 SDLoc DL(N);
45000 SDValue EFLAGS = N->getOperand(3);
45001 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
45002
45003 // Try to simplify the EFLAGS and condition code operands.
45004 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
45005 // RAUW them under us.
45006 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
45007 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
45008 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
45009 N->getOperand(1), Cond, Flags);
45010 }
45011
45012 return SDValue();
45013}
45014
45015static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
45016 SelectionDAG &DAG) {
45017 // Take advantage of vector comparisons producing 0 or -1 in each lane to
45018 // optimize away operation when it's from a constant.
45019 //
45020 // The general transformation is:
45021 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
45022 // AND(VECTOR_CMP(x,y), constant2)
45023 // constant2 = UNARYOP(constant)
45024
45025 // Early exit if this isn't a vector operation, the operand of the
45026 // unary operation isn't a bitwise AND, or if the sizes of the operations
45027 // aren't the same.
45028 EVT VT = N->getValueType(0);
45029 bool IsStrict = N->isStrictFPOpcode();
45030 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
45031 if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
45032 Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
45033 VT.getSizeInBits() != Op0.getValueSizeInBits())
45034 return SDValue();
45035
45036 // Now check that the other operand of the AND is a constant. We could
45037 // make the transformation for non-constant splats as well, but it's unclear
45038 // that would be a benefit as it would not eliminate any operations, just
45039 // perform one more step in scalar code before moving to the vector unit.
45040 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
45041 // Bail out if the vector isn't a constant.
45042 if (!BV->isConstant())
45043 return SDValue();
45044
45045 // Everything checks out. Build up the new and improved node.
45046 SDLoc DL(N);
45047 EVT IntVT = BV->getValueType(0);
45048 // Create a new constant of the appropriate type for the transformed
45049 // DAG.
45050 SDValue SourceConst;
45051 if (IsStrict)
45052 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
45053 {N->getOperand(0), SDValue(BV, 0)});
45054 else
45055 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
45056 // The AND node needs bitcasts to/from an integer vector type around it.
45057 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
45058 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
45059 MaskConst);
45060 SDValue Res = DAG.getBitcast(VT, NewAnd);
45061 if (IsStrict)
45062 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
45063 return Res;
45064 }
45065
45066 return SDValue();
45067}
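
The fold above is sound because each vector-compare lane is either all-zeros or all-ones, and converting the zero lane yields +0.0, whose bit pattern is also all zeros. A minimal standalone scalar sketch of that argument (helper names are illustrative, not from this file):

#include <cassert>
#include <cstdint>
#include <cstring>

// Return the raw bit pattern of a float (stand-in for a DAG bitcast).
static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

int main() {
  const int32_t C = 42;                    // one lane of the constant vector
  const uint32_t LaneMasks[] = {0u, ~0u};  // possible vector-compare results
  for (uint32_t M : LaneMasks) {
    // UNARYOP(AND(mask, C)) computed directly.
    float Direct =
        static_cast<float>(static_cast<int32_t>(M & static_cast<uint32_t>(C)));
    // AND(mask, bitcast(UNARYOP(C))): masking the converted constant gives the
    // same bits, because +0.0f is the all-zero pattern.
    uint32_t Folded = M & bitsOf(static_cast<float>(C));
    assert(bitsOf(Direct) == Folded);
  }
  return 0;
}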
45068
45069/// If we are converting a value to floating-point, try to replace scalar
45070/// truncate of an extracted vector element with a bitcast. This tries to keep
45071/// the sequence on XMM registers rather than moving between vector and GPRs.
45072static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
45073 // TODO: This is currently only used by combineSIntToFP, but it is generalized
45074 // to allow being called by any similar cast opcode.
45075 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
45076 SDValue Trunc = N->getOperand(0);
45077 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
45078 return SDValue();
45079
45080 SDValue ExtElt = Trunc.getOperand(0);
45081 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45082 !isNullConstant(ExtElt.getOperand(1)))
45083 return SDValue();
45084
45085 EVT TruncVT = Trunc.getValueType();
45086 EVT SrcVT = ExtElt.getValueType();
45087 unsigned DestWidth = TruncVT.getSizeInBits();
45088 unsigned SrcWidth = SrcVT.getSizeInBits();
45089 if (SrcWidth % DestWidth != 0)
45090 return SDValue();
45091
45092 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
45093 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
45094 unsigned VecWidth = SrcVecVT.getSizeInBits();
45095 unsigned NumElts = VecWidth / DestWidth;
45096 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
45097 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
45098 SDLoc DL(N);
45099 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
45100 BitcastVec, ExtElt.getOperand(1));
45101 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
45102}
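
The bitcast trick above assumes a little-endian layout: truncating lane 0 of the wide vector reads the same bytes as lane 0 of the same data reinterpreted with the narrower element type. A small standalone sketch of that equivalence (illustrative, little-endian assumed):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // A "v2i64" source vector.
  uint64_t Src[2] = {0x1122334455667788ULL, 0x99AABBCCDDEEFF00ULL};

  // trunc (extelt X, 0) to i32.
  uint32_t Truncated = static_cast<uint32_t>(Src[0]);

  // extelt (bitcast X to v4i32), 0 -- modeled with memcpy on little-endian.
  uint32_t Lanes[4];
  std::memcpy(Lanes, Src, sizeof(Lanes));
  assert(Truncated == Lanes[0]);
  return 0;
}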
45103
45104static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
45105 const X86Subtarget &Subtarget) {
45106 bool IsStrict = N->isStrictFPOpcode();
45107 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
45108 EVT VT = N->getValueType(0);
45109 EVT InVT = Op0.getValueType();
45110
45111 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
45112 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
45113 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
45114 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
45115 SDLoc dl(N);
45116 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45117 InVT.getVectorNumElements());
45118 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
45119
45120 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
45121 if (IsStrict)
45122 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
45123 {N->getOperand(0), P});
45124 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
45125 }
45126
45127 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
45128 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
45129 // the optimization here.
45130 if (DAG.SignBitIsZero(Op0)) {
45131 if (IsStrict)
45132 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
45133 {N->getOperand(0), Op0});
45134 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
45135 }
45136
45137 return SDValue();
45138}
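
The SignBitIsZero fold relies on signed and unsigned int-to-FP conversions agreeing whenever the sign bit is clear. A minimal standalone check of that property (illustrative values):

#include <cassert>
#include <cstdint>

int main() {
  // Any value with bit 31 clear converts identically either way.
  const uint32_t Values[] = {0u, 1u, 12345u, 0x7FFFFFFFu};
  for (uint32_t X : Values) {
    double AsUnsigned = static_cast<double>(X);
    double AsSigned = static_cast<double>(static_cast<int32_t>(X));
    assert(AsUnsigned == AsSigned);
  }
  return 0;
}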
45139
45140static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
45141 TargetLowering::DAGCombinerInfo &DCI,
45142 const X86Subtarget &Subtarget) {
45143 // First try to optimize away the conversion entirely when it's
45144 // conditionally from a constant. Vectors only.
45145 bool IsStrict = N->isStrictFPOpcode();
45146 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
45147 return Res;
45148
45149 // Now move on to more general possibilities.
45150 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
45151 EVT VT = N->getValueType(0);
45152 EVT InVT = Op0.getValueType();
45153
45154 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
45155 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
45156 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
45157 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
45158 SDLoc dl(N);
45159 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45160 InVT.getVectorNumElements());
45161 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
45162 if (IsStrict)
45163 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
45164 {N->getOperand(0), P});
45165 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
45166 }
45167
45168 // Without AVX512DQ we only support i64 to float scalar conversion. For both
45169 // vectors and scalars, see if we know that the upper bits are all the sign
45170 // bit, in which case we can truncate the input to i32 and convert from that.
45171 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
45172 unsigned BitWidth = InVT.getScalarSizeInBits();
45173 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
45174 if (NumSignBits >= (BitWidth - 31)) {
45175 EVT TruncVT = MVT::i32;
45176 if (InVT.isVector())
45177 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
45178 InVT.getVectorNumElements());
45179 SDLoc dl(N);
45180 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
45181 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
45182 if (IsStrict)
45183 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
45184 {N->getOperand(0), Trunc});
45185 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
45186 }
45187 // If we're after legalize and the type is v2i32 we need to shuffle and
45188 // use CVTSI2P.
45189 assert(InVT == MVT::v2i64 && "Unexpected VT!");
45190 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
45191 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
45192 { 0, 2, -1, -1 });
45193 if (IsStrict)
45194 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
45195 {N->getOperand(0), Shuf});
45196 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
45197 }
45198 }
45199
45200 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
45201 // a 32-bit target where SSE doesn't support i64->FP operations.
45202 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
45203 Op0.getOpcode() == ISD::LOAD) {
45204 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
45205
45206 // This transformation is not supported if the result type is f16 or f128.
45207 if (VT == MVT::f16 || VT == MVT::f128)
45208 return SDValue();
45209
45210 // If we have AVX512DQ we can use packed conversion instructions unless
45211 // the VT is f80.
45212 if (Subtarget.hasDQI() && VT != MVT::f80)
45213 return SDValue();
45214
45215 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
45216 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
45217 std::pair<SDValue, SDValue> Tmp =
45218 Subtarget.getTargetLowering()->BuildFILD(
45219 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
45220 Ld->getPointerInfo(), Ld->getAlignment(), DAG);
45221 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
45222 return Tmp.first;
45223 }
45224 }
45225
45226 if (IsStrict)
45227 return SDValue();
45228
45229 if (SDValue V = combineToFPTruncExtElt(N, DAG))
45230 return V;
45231
45232 return SDValue();
45233}
45234
45235static bool needCarryOrOverflowFlag(SDValue Flags) {
45236 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
45237
45238 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
45239 UI != UE; ++UI) {
45240 SDNode *User = *UI;
45241
45242 X86::CondCode CC;
45243 switch (User->getOpcode()) {
45244 default:
45245 // Be conservative.
45246 return true;
45247 case X86ISD::SETCC:
45248 case X86ISD::SETCC_CARRY:
45249 CC = (X86::CondCode)User->getConstantOperandVal(0);
45250 break;
45251 case X86ISD::BRCOND:
45252 CC = (X86::CondCode)User->getConstantOperandVal(2);
45253 break;
45254 case X86ISD::CMOV:
45255 CC = (X86::CondCode)User->getConstantOperandVal(2);
45256 break;
45257 }
45258
45259 switch (CC) {
45260 default: break;
45261 case X86::COND_A: case X86::COND_AE:
45262 case X86::COND_B: case X86::COND_BE:
45263 case X86::COND_O: case X86::COND_NO:
45264 case X86::COND_G: case X86::COND_GE:
45265 case X86::COND_L: case X86::COND_LE:
45266 return true;
45267 }
45268 }
45269
45270 return false;
45271}
45272
45273static bool onlyZeroFlagUsed(SDValue Flags) {
45274 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
45275
45276 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
45277 UI != UE; ++UI) {
45278 SDNode *User = *UI;
45279
45280 unsigned CCOpNo;
45281 switch (User->getOpcode()) {
45282 default:
45283 // Be conservative.
45284 return false;
45285 case X86ISD::SETCC: CCOpNo = 0; break;
45286 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
45287 case X86ISD::BRCOND: CCOpNo = 2; break;
45288 case X86ISD::CMOV: CCOpNo = 2; break;
45289 }
45290
45291 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
45292 if (CC != X86::COND_E && CC != X86::COND_NE)
45293 return false;
45294 }
45295
45296 return true;
45297}
45298
45299static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
45300 // Only handle test patterns.
45301 if (!isNullConstant(N->getOperand(1)))
45302 return SDValue();
45303
45304 // If we have a CMP of a truncated binop, see if we can make a smaller binop
45305 // and use its flags directly.
45306 // TODO: Maybe we should try promoting compares that only use the zero flag
45307 // first if we can prove the upper bits with computeKnownBits?
45308 SDLoc dl(N);
45309 SDValue Op = N->getOperand(0);
45310 EVT VT = Op.getValueType();
45311
45312 // If we have a constant logical shift that's only used in a comparison
45313 // against zero turn it into an equivalent AND. This allows turning it into
45314 // a TEST instruction later.
45315 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
45316 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
45317 onlyZeroFlagUsed(SDValue(N, 0))) {
45318 unsigned BitWidth = VT.getSizeInBits();
45319 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
45320 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
45321 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
45322 APInt Mask = Op.getOpcode() == ISD::SRL
45323 ? APInt::getHighBitsSet(BitWidth, MaskBits)
45324 : APInt::getLowBitsSet(BitWidth, MaskBits);
45325 if (Mask.isSignedIntN(32)) {
45326 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
45327 DAG.getConstant(Mask, dl, VT));
45328 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
45329 DAG.getConstant(0, dl, VT));
45330 }
45331 }
45332 }
45333
45334 // Look for a truncate with a single use.
45335 if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse())
45336 return SDValue();
45337
45338 Op = Op.getOperand(0);
45339
45340 // Arithmetic op can only have one use.
45341 if (!Op.hasOneUse())
45342 return SDValue();
45343
45344 unsigned NewOpc;
45345 switch (Op.getOpcode()) {
45346 default: return SDValue();
45347 case ISD::AND:
45348 // Skip AND with a constant. We have special handling for AND with an
45349 // immediate during isel to generate TEST instructions.
45350 if (isa<ConstantSDNode>(Op.getOperand(1)))
45351 return SDValue();
45352 NewOpc = X86ISD::AND;
45353 break;
45354 case ISD::OR: NewOpc = X86ISD::OR; break;
45355 case ISD::XOR: NewOpc = X86ISD::XOR; break;
45356 case ISD::ADD:
45357 // If the carry or overflow flag is used, we can't truncate.
45358 if (needCarryOrOverflowFlag(SDValue(N, 0)))
45359 return SDValue();
45360 NewOpc = X86ISD::ADD;
45361 break;
45362 case ISD::SUB:
45363 // If the carry or overflow flag is used, we can't truncate.
45364 if (needCarryOrOverflowFlag(SDValue(N, 0)))
45365 return SDValue();
45366 NewOpc = X86ISD::SUB;
45367 break;
45368 }
45369
45370 // We found an op we can narrow. Truncate its inputs.
45371 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
45372 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
45373
45374 // Use a X86 specific opcode to avoid DAG combine messing with it.
45375 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45376 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
45377
45378 // For AND, keep a CMP so that we can match the test pattern.
45379 if (NewOpc == X86ISD::AND)
45380 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
45381 DAG.getConstant(0, dl, VT));
45382
45383 // Return the flags.
45384 return Op.getValue(1);
45385}
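
The shift-to-AND rewrite above uses the fact that a logical shift compares equal to zero exactly when the bits it keeps are all zero, so the same zero flag can be produced by a TEST against a mask. A standalone sketch for the SRL case (illustrative names; the SHL case is symmetric with a low-bits mask):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 5;
  // High-bits mask for a 32-bit value, as APInt::getHighBitsSet(32, 32 - ShAmt)
  // would build for the SRL case.
  const uint32_t HighMask = ~0u << ShAmt;

  const uint32_t Values[] = {0u, 1u, 31u, 32u, 0x12345678u, ~0u};
  for (uint32_t X : Values) {
    bool ShiftIsZero = (X >> ShAmt) == 0;
    bool MaskedIsZero = (X & HighMask) == 0;
    assert(ShiftIsZero == MaskedIsZero);  // the compare-with-zero is preserved
  }
  return 0;
}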
45386
45387static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
45388 TargetLowering::DAGCombinerInfo &DCI) {
45389 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
45390        "Expected X86ISD::ADD or X86ISD::SUB");
45391
45392 SDLoc DL(N);
45393 SDValue LHS = N->getOperand(0);
45394 SDValue RHS = N->getOperand(1);
45395 MVT VT = LHS.getSimpleValueType();
45396 unsigned GenericOpc = X86ISD::ADD == N->getOpcode() ? ISD::ADD : ISD::SUB;
45397
45398 // If we don't use the flag result, simplify back to a generic ADD/SUB.
45399 if (!N->hasAnyUseOfValue(1)) {
45400 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
45401 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
45402 }
45403
45404 // Fold any similar generic ADD/SUB opcodes to reuse this node.
45405 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
45406 SDValue Ops[] = {N0, N1};
45407 SDVTList VTs = DAG.getVTList(N->getValueType(0));
45408 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
45409 SDValue Op(N, 0);
45410 if (Negate)
45411 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
45412 DCI.CombineTo(GenericAddSub, Op);
45413 }
45414 };
45415 MatchGeneric(LHS, RHS, false);
45416 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
45417
45418 return SDValue();
45419}
45420
45421static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
45422 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
45423 MVT VT = N->getSimpleValueType(0);
45424 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45425 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
45426 N->getOperand(0), N->getOperand(1),
45427 Flags);
45428 }
45429
45430 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
45431 // iff the flag result is dead.
45432 SDValue Op0 = N->getOperand(0);
45433 SDValue Op1 = N->getOperand(1);
45434 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op1) &&
45435 !N->hasAnyUseOfValue(1))
45436 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), Op0.getOperand(0),
45437 Op0.getOperand(1), N->getOperand(2));
45438
45439 return SDValue();
45440}
45441
45442// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
45443static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
45444 TargetLowering::DAGCombinerInfo &DCI) {
45445 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
45446 // the result is either zero or one (depending on the input carry bit).
45447 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
45448 if (X86::isZeroNode(N->getOperand(0)) &&
45449 X86::isZeroNode(N->getOperand(1)) &&
45450 // We don't have a good way to replace an EFLAGS use, so only do this when
45451 // dead right now.
45452 SDValue(N, 1).use_empty()) {
45453 SDLoc DL(N);
45454 EVT VT = N->getValueType(0);
45455 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
45456 SDValue Res1 =
45457 DAG.getNode(ISD::AND, DL, VT,
45458 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45459 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45460 N->getOperand(2)),
45461 DAG.getConstant(1, DL, VT));
45462 return DCI.CombineTo(N, Res1, CarryOut);
45463 }
45464
45465 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2), DAG)) {
45466 MVT VT = N->getSimpleValueType(0);
45467 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45468 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
45469 N->getOperand(0), N->getOperand(1),
45470 Flags);
45471 }
45472
45473 return SDValue();
45474}
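
The strength reduction above observes that ADC of two zeros can only return the incoming carry bit, which is exactly SETCC_CARRY masked with 1. A minimal scalar model of that identity (the helpers are illustrative stand-ins for the DAG nodes):

#include <cassert>
#include <cstdint>

// Scalar models: ADC result and SETCC_CARRY (all-ones when the carry is set).
static uint32_t Adc(uint32_t X, uint32_t Y, bool CF) { return X + Y + CF; }
static uint32_t SetccCarry(bool CF) { return CF ? ~0u : 0u; }

int main() {
  for (bool CF : {false, true}) {
    // ADC 0, 0, CF can only produce the carry bit itself,
    // which is SETCC_CARRY & 1.
    assert(Adc(0, 0, CF) == (SetccCarry(CF) & 1u));
  }
  return 0;
}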
45475
45476/// If this is an add or subtract where one operand is produced by a cmp+setcc,
45477/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
45478/// with CMP+{ADC, SBB}.
45479static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
45480 bool IsSub = N->getOpcode() == ISD::SUB;
45481 SDValue X = N->getOperand(0);
45482 SDValue Y = N->getOperand(1);
45483
45484 // If this is an add, canonicalize a zext operand to the RHS.
45485 // TODO: Incomplete? What if both sides are zexts?
45486 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
45487 Y.getOpcode() != ISD::ZERO_EXTEND)
45488 std::swap(X, Y);
45489
45490 // Look through a one-use zext.
45491 bool PeekedThroughZext = false;
45492 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
45493 Y = Y.getOperand(0);
45494 PeekedThroughZext = true;
45495 }
45496
45497 // If this is an add, canonicalize a setcc operand to the RHS.
45498 // TODO: Incomplete? What if both sides are setcc?
45499 // TODO: Should we allow peeking through a zext of the other operand?
45500 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
45501 Y.getOpcode() != X86ISD::SETCC)
45502 std::swap(X, Y);
45503
45504 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
45505 return SDValue();
45506
45507 SDLoc DL(N);
45508 EVT VT = N->getValueType(0);
45509 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
45510
45511 // If X is -1 or 0, then we have an opportunity to avoid constants required in
45512 // the general case below.
45513 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
45514 if (ConstantX) {
45515 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
45516 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
45517 // This is a complicated way to get -1 or 0 from the carry flag:
45518 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
45519 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
45520 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45521 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45522 Y.getOperand(1));
45523 }
45524
45525 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
45526 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
45527 SDValue EFLAGS = Y->getOperand(1);
45528 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
45529 EFLAGS.getValueType().isInteger() &&
45530 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
45531 // Swap the operands of a SUB, and we have the same pattern as above.
45532 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
45533 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
45534 SDValue NewSub = DAG.getNode(
45535 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
45536 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
45537 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
45538 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45539 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45540 NewEFLAGS);
45541 }
45542 }
45543 }
45544
45545 if (CC == X86::COND_B) {
45546 // X + SETB Z --> adc X, 0
45547 // X - SETB Z --> sbb X, 0
45548 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
45549 DAG.getVTList(VT, MVT::i32), X,
45550 DAG.getConstant(0, DL, VT), Y.getOperand(1));
45551 }
45552
45553 if (CC == X86::COND_A) {
45554 SDValue EFLAGS = Y->getOperand(1);
45555 // Try to convert COND_A into COND_B in an attempt to facilitate
45556 // materializing "setb reg".
45557 //
45558 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
45559 // cannot take an immediate as its first operand.
45560 //
45561 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
45562 EFLAGS.getValueType().isInteger() &&
45563 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
45564 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
45565 EFLAGS.getNode()->getVTList(),
45566 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
45567 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
45568 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
45569 DAG.getVTList(VT, MVT::i32), X,
45570 DAG.getConstant(0, DL, VT), NewEFLAGS);
45571 }
45572 }
45573
45574 if (CC != X86::COND_E && CC != X86::COND_NE)
45575 return SDValue();
45576
45577 SDValue Cmp = Y.getOperand(1);
45578 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
45579 !X86::isZeroNode(Cmp.getOperand(1)) ||
45580 !Cmp.getOperand(0).getValueType().isInteger())
45581 return SDValue();
45582
45583 SDValue Z = Cmp.getOperand(0);
45584 EVT ZVT = Z.getValueType();
45585
45586 // If X is -1 or 0, then we have an opportunity to avoid constants required in
45587 // the general case below.
45588 if (ConstantX) {
45589 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
45590 // fake operands:
45591 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
45592 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
45593 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
45594 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
45595 SDValue Zero = DAG.getConstant(0, DL, ZVT);
45596 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
45597 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
45598 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45599 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45600 SDValue(Neg.getNode(), 1));
45601 }
45602
45603 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
45604 // with fake operands:
45605 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
45606 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
45607 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
45608 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
45609 SDValue One = DAG.getConstant(1, DL, ZVT);
45610 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
45611 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
45612 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
45613 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
45614 Cmp1.getValue(1));
45615 }
45616 }
45617
45618 // (cmp Z, 1) sets the carry flag if Z is 0.
45619 SDValue One = DAG.getConstant(1, DL, ZVT);
45620 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
45621 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
45622
45623 // Add the flags type for ADC/SBB nodes.
45624 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
45625
45626 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
45627 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
45628 if (CC == X86::COND_NE)
45629 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
45630 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
45631
45632 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
45633 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
45634 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
45635 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
45636}
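
The closing ADC/SBB forms depend on (cmp Z, 1) setting the carry flag exactly when Z == 0. A standalone scalar sketch of the COND_NE rewrites (Adc and Sbb are illustrative models of the flag-consuming nodes):

#include <cassert>
#include <cstdint>

// Scalar models of the x86 flag-using arithmetic (illustrative only).
static uint32_t Adc(uint32_t X, uint32_t Y, bool CF) { return X + Y + CF; }
static uint32_t Sbb(uint32_t X, uint32_t Y, bool CF) { return X - Y - CF; }

int main() {
  const uint32_t X = 1000;
  for (uint32_t Z : {0u, 1u, 7u, 0xFFFFFFFFu}) {
    bool CF = Z < 1;  // carry flag after (cmp Z, 1): set iff Z == 0
    // X - (Z != 0) --> adc X, -1, (cmp Z, 1)
    assert(X - (Z != 0 ? 1u : 0u) == Adc(X, ~0u, CF));
    // X + (Z != 0) --> sbb X, -1, (cmp Z, 1)
    assert(X + (Z != 0 ? 1u : 0u) == Sbb(X, ~0u, CF));
  }
  return 0;
}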
45637
45638static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
45639 const X86Subtarget &Subtarget) {
45640 if (!Subtarget.hasSSE2())
45641 return SDValue();
45642
45643 EVT VT = N->getValueType(0);
45644
45645 // Do not use PMADDWD if the vector has fewer than 8 elements; we need at
45646 // least a 128-bit vector of i16 operands after shrinking the multiply.
45647 if (!VT.isVector() || VT.getVectorNumElements() < 8)
45648 return SDValue();
45649
45650 SDValue Op0 = N->getOperand(0);
45651 SDValue Op1 = N->getOperand(1);
45652
45653 auto UsePMADDWD = [&](SDValue Op) {
45654 ShrinkMode Mode;
45655 return Op.getOpcode() == ISD::MUL &&
45656 canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
45657 Mode != ShrinkMode::MULU16 &&
45658 (!Subtarget.hasSSE41() ||
45659 (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
45660 Op->isOnlyUserOf(Op.getOperand(1).getNode())));
45661 };
45662
45663 SDValue MulOp, OtherOp;
45664 if (UsePMADDWD(Op0)) {
45665 MulOp = Op0;
45666 OtherOp = Op1;
45667 } else if (UsePMADDWD(Op1)) {
45668 MulOp = Op1;
45669 OtherOp = Op0;
45670 } else
45671 return SDValue();
45672
45673 SDLoc DL(N);
45674 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
45675 VT.getVectorNumElements());
45676 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45677 VT.getVectorNumElements() / 2);
45678
45679 // Shrink the operands of mul.
45680 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
45681 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
45682
45683 // Madd vector size is half of the original vector size
45684 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45685 ArrayRef<SDValue> Ops) {
45686 MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
45687 return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
45688 };
45689 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
45690 PMADDWDBuilder);
45691 // Fill the rest of the output with 0
45692 SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
45693 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
45694
45695 // Preserve the reduction flag on the ADD. We may need to revisit for the
45696 // other operand.
45697 SDNodeFlags Flags;
45698 Flags.setVectorReduction(true);
45699 return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
45700}
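
PMADDWD multiplies adjacent pairs of signed 16-bit elements and sums each pair into one 32-bit lane, which is why a widened i16 multiply feeding a reduction add can be shrunk to it. A scalar model of the lane arithmetic (illustrative, not the DAG code):

#include <cassert>
#include <cstdint>

// One 32-bit PMADDWD output lane: multiply two adjacent signed i16 pairs and
// sum the products.
static int32_t PmaddwdLane(int16_t A0, int16_t A1, int16_t B0, int16_t B1) {
  return static_cast<int32_t>(A0) * B0 + static_cast<int32_t>(A1) * B1;
}

int main() {
  int16_t A[8] = {1, -2, 300, -400, 5000, -6000, 7, 8};
  int16_t B[8] = {9, 10, -11, 12, -13, 14, 1500, -1600};

  // The pattern the combine looks for: widen to i32, multiply, then add
  // adjacent products as part of a reduction.
  int32_t Wide[8];
  for (int i = 0; i != 8; ++i)
    Wide[i] = static_cast<int32_t>(A[i]) * static_cast<int32_t>(B[i]);

  for (int i = 0; i != 4; ++i) {
    int32_t PairSum = Wide[2 * i] + Wide[2 * i + 1];
    assert(PairSum ==
           PmaddwdLane(A[2 * i], A[2 * i + 1], B[2 * i], B[2 * i + 1]));
  }
  return 0;
}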
45701
45702static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
45703 const X86Subtarget &Subtarget) {
45704 if (!Subtarget.hasSSE2())
45705 return SDValue();
45706
45707 SDLoc DL(N);
45708 EVT VT = N->getValueType(0);
45709
45710 // TODO: There's nothing special about i32, any integer type above i16 should
45711 // work just as well.
45712 if (!VT.isVector() || !VT.isSimple() ||
45713 !(VT.getVectorElementType() == MVT::i32))
45714 return SDValue();
45715
45716 unsigned RegSize = 128;
45717 if (Subtarget.useBWIRegs())
45718 RegSize = 512;
45719 else if (Subtarget.hasAVX())
45720 RegSize = 256;
45721
45722 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
45723 // TODO: We should be able to handle larger vectors by splitting them before
45724 // feeding them into several SADs, and then reducing over those.
45725 if (VT.getSizeInBits() / 4 > RegSize)
45726 return SDValue();
45727
45728 // We know N is a reduction add. To match SAD, we need one of the operands to
45729 // be an ABS.
45730 SDValue AbsOp = N->getOperand(0);
45731 SDValue OtherOp = N->getOperand(1);
45732 if (AbsOp.getOpcode() != ISD::ABS)
45733 std::swap(AbsOp, OtherOp);
45734 if (AbsOp.getOpcode() != ISD::ABS)
45735 return SDValue();
45736
45737 // Check whether we have an abs-diff pattern feeding into the select.
45738 SDValue SadOp0, SadOp1;
45739 if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
45740 return SDValue();
45741
45742 // SAD pattern detected. Now build a SAD instruction and an addition for
45743 // reduction. Note that the number of elements of the result of SAD is less
45744 // than the number of elements of its input. Therefore, we can only update
45745 // part of the elements in the reduction vector.
45746 SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
45747
45748 // The output of PSADBW is a vector of i64.
45749 // We need to turn the vector of i64 into a vector of i32.
45750 // If the reduction vector is at least as wide as the psadbw result, just
45751 // bitcast. If it's narrower, which can only occur for v2i32, bits 127:16 of
45752 // the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
45753 // result to v2i32, which will be removed by type legalization. If we widen
45754 // narrow vectors, then we bitcast to v4i32 and extract v2i32.
45755 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
45756 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
45757
45758 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
45759 // Fill the upper elements with zero to match the add width.
45760 assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
45761 unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
45762 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
45763 Ops[0] = Sad;
45764 Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
45765 } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
45766 Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
45767 DAG.getIntPtrConstant(0, DL));
45768 }
45769
45770 // Preserve the reduction flag on the ADD. We may need to revisit for the
45771 // other operand.
45772 SDNodeFlags Flags;
45773 Flags.setVectorReduction(true);
45774 return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
45775}
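
PSADBW sums the absolute differences of eight unsigned bytes into one 64-bit lane, matching the abs(zext(a) - zext(b)) reduction detected above. A standalone scalar sketch of one lane (illustrative helper name):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// One 64-bit PSADBW lane: the sum of absolute differences of 8 byte pairs.
static uint64_t PsadbwLane(const uint8_t *A, const uint8_t *B) {
  uint64_t Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += static_cast<uint64_t>(A[i] > B[i] ? A[i] - B[i] : B[i] - A[i]);
  return Sum;
}

int main() {
  uint8_t A[8] = {0, 255, 10, 20, 30, 40, 50, 60};
  uint8_t B[8] = {255, 0, 15, 5, 30, 45, 40, 70};

  // The reduction the combine recognizes: abs(zext(a) - zext(b)) summed up.
  int64_t Reduction = 0;
  for (int i = 0; i != 8; ++i)
    Reduction += std::abs(static_cast<int32_t>(A[i]) - static_cast<int32_t>(B[i]));

  assert(static_cast<uint64_t>(Reduction) == PsadbwLane(A, B));
  return 0;
}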
45776
45777static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
45778 const SDLoc &DL, EVT VT,
45779 const X86Subtarget &Subtarget) {
45780 // Example of pattern we try to detect:
45781 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
45782 //(add (build_vector (extract_elt t, 0),
45783 // (extract_elt t, 2),
45784 // (extract_elt t, 4),
45785 // (extract_elt t, 6)),
45786 // (build_vector (extract_elt t, 1),
45787 // (extract_elt t, 3),
45788 // (extract_elt t, 5),
45789 // (extract_elt t, 7)))
45790
45791 if (!Subtarget.hasSSE2())
45792 return SDValue();
45793
45794 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
45795 Op1.getOpcode() != ISD::BUILD_VECTOR)
45796 return SDValue();
45797
45798 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
45799 VT.getVectorNumElements() < 4 ||
45800 !isPowerOf2_32(VT.getVectorNumElements()))
45801 return SDValue();
45802
45803 // Check if one of Op0,Op1 is of the form:
45804 // (build_vector (extract_elt Mul, 0),
45805 // (extract_elt Mul, 2),
45806 // (extract_elt Mul, 4),
45807 // ...
45808 // the other is of the form:
45809 // (build_vector (extract_elt Mul, 1),
45810 // (extract_elt Mul, 3),
45811 // (extract_elt Mul, 5),
45812 // ...
45813 // and identify Mul.
45814 SDValue Mul;
45815 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
45816 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
45817 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
45818 // TODO: Be more tolerant to undefs.
45819 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45820 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45821 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45822 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
45823 return SDValue();
45824 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
45825 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
45826 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
45827 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
45828 if (!Const0L || !Const1L || !Const0H || !Const1H)
45829 return SDValue();
45830 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
45831 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
45832 // Commutativity of mul allows factors of a product to reorder.
45833 if (Idx0L > Idx1L)
45834 std::swap(Idx0L, Idx1L);
45835 if (Idx0H > Idx1H)
45836 std::swap(Idx0H, Idx1H);
45837 // Commutativity of add allows pairs of factors to reorder.
45838 if (Idx0L > Idx0H) {
45839 std::swap(Idx0L, Idx0H);
45840 std::swap(Idx1L, Idx1H);
45841 }
45842 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
45843 Idx1H != 2 * i + 3)
45844 return SDValue();
45845 if (!Mul) {
45846 // First time an extract_elt's source vector is visited. Must be a MUL
45847 // with 2X number of vector elements than the BUILD_VECTOR.
45848 // Both extracts must be from same MUL.
45849 Mul = Op0L->getOperand(0);
45850 if (Mul->getOpcode() != ISD::MUL ||
45851 Mul.getValueType().getVectorNumElements() != 2 * e)
45852 return SDValue();
45853 }
45854 // Check that the extract is from the same MUL previously seen.
45855 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
45856 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
45857 return SDValue();
45858 }
45859
45860 // Check if the Mul source can be safely shrunk.
45861 ShrinkMode Mode;
45862 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
45863 Mode == ShrinkMode::MULU16)
45864 return SDValue();
45865
45866 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45867 ArrayRef<SDValue> Ops) {
45868 // Shrink by adding truncate nodes and let DAGCombine fold with the
45869 // sources.
45870 EVT InVT = Ops[0].getValueType();
45871 assert(InVT.getScalarType() == MVT::i32 &&
45872        "Unexpected scalar element type");
45873 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
45874 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
45875 InVT.getVectorNumElements() / 2);
45876 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
45877 InVT.getVectorNumElements());
45878 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
45879 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
45880 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
45881 };
45882 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
45883 { Mul.getOperand(0), Mul.getOperand(1) },
45884 PMADDBuilder);
45885}
45886
45887// Attempt to turn this pattern into PMADDWD.
45888// (mul (add (sext (build_vector)), (sext (build_vector))),
45889// (add (sext (build_vector)), (sext (build_vector)))
45890static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
45891 const SDLoc &DL, EVT VT,
45892 const X86Subtarget &Subtarget) {
45893 if (!Subtarget.hasSSE2())
45894 return SDValue();
45895
45896 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
45897 return SDValue();
45898
45899 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
45900 VT.getVectorNumElements() < 4 ||
45901 !isPowerOf2_32(VT.getVectorNumElements()))
45902 return SDValue();
45903
45904 SDValue N00 = N0.getOperand(0);
45905 SDValue N01 = N0.getOperand(1);
45906 SDValue N10 = N1.getOperand(0);
45907 SDValue N11 = N1.getOperand(1);
45908
45909 // All inputs need to be sign extends.
45910 // TODO: Support ZERO_EXTEND from known positive?
45911 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
45912 N01.getOpcode() != ISD::SIGN_EXTEND ||
45913 N10.getOpcode() != ISD::SIGN_EXTEND ||
45914 N11.getOpcode() != ISD::SIGN_EXTEND)
45915 return SDValue();
45916
45917 // Peek through the extends.
45918 N00 = N00.getOperand(0);
45919 N01 = N01.getOperand(0);
45920 N10 = N10.getOperand(0);
45921 N11 = N11.getOperand(0);
45922
45923 // Must be extending from vXi16.
45924 EVT InVT = N00.getValueType();
45925 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
45926 N10.getValueType() != InVT || N11.getValueType() != InVT)
45927 return SDValue();
45928
45929 // All inputs should be build_vectors.
45930 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
45931 N01.getOpcode() != ISD::BUILD_VECTOR ||
45932 N10.getOpcode() != ISD::BUILD_VECTOR ||
45933 N11.getOpcode() != ISD::BUILD_VECTOR)
45934 return SDValue();
45935
45936 // For each element, we need to ensure we have an odd element from one vector
45937 // multiplied by the odd element of another vector and the even element from
45938 // one of the same vectors being multiplied by the even element from the
45939 // other vector. So we need to make sure for each element i, this operator
45940 // is being performed:
45941 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
45942 SDValue In0, In1;
45943 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
45944 SDValue N00Elt = N00.getOperand(i);
45945 SDValue N01Elt = N01.getOperand(i);
45946 SDValue N10Elt = N10.getOperand(i);
45947 SDValue N11Elt = N11.getOperand(i);
45948 // TODO: Be more tolerant to undefs.
45949 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45950 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45951 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
45952 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
45953 return SDValue();
45954 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
45955 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
45956 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
45957 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
45958 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
45959 return SDValue();
45960 unsigned IdxN00 = ConstN00Elt->getZExtValue();
45961 unsigned IdxN01 = ConstN01Elt->getZExtValue();
45962 unsigned IdxN10 = ConstN10Elt->getZExtValue();
45963 unsigned IdxN11 = ConstN11Elt->getZExtValue();
45964 // Add is commutative so indices can be reordered.
45965 if (IdxN00 > IdxN10) {
45966 std::swap(IdxN00, IdxN10);
45967 std::swap(IdxN01, IdxN11);
45968 }
45969 // N0 indices must be the even element. N1 indices must be the next odd element.
45970 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
45971 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
45972 return SDValue();
45973 SDValue N00In = N00Elt.getOperand(0);
45974 SDValue N01In = N01Elt.getOperand(0);
45975 SDValue N10In = N10Elt.getOperand(0);
45976 SDValue N11In = N11Elt.getOperand(0);
45977 // First time we find an input capture it.
45978 if (!In0) {
45979 In0 = N00In;
45980 In1 = N01In;
45981 }
45982 // Mul is commutative so the input vectors can be in any order.
45983 // Canonicalize to make the compares easier.
45984 if (In0 != N00In)
45985 std::swap(N00In, N01In);
45986 if (In0 != N10In)
45987 std::swap(N10In, N11In);
45988 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
45989 return SDValue();
45990 }
45991
45992 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45993 ArrayRef<SDValue> Ops) {
45994 // Shrink by adding truncate nodes and let DAGCombine fold with the
45995 // sources.
45996 EVT OpVT = Ops[0].getValueType();
45997 assert(OpVT.getScalarType() == MVT::i16 &&
45998        "Unexpected scalar element type");
45999 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
46000 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
46001 OpVT.getVectorNumElements() / 2);
46002 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
46003 };
46004 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
46005 PMADDBuilder);
46006}
46007
46008static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
46009 TargetLowering::DAGCombinerInfo &DCI,
46010 const X86Subtarget &Subtarget) {
46011 const SDNodeFlags Flags = N->getFlags();
46012 if (Flags.hasVectorReduction()) {
46013 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
46014 return Sad;
46015 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
46016 return MAdd;
46017 }
46018 EVT VT = N->getValueType(0);
46019 SDValue Op0 = N->getOperand(0);
46020 SDValue Op1 = N->getOperand(1);
46021
46022 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
46023 return MAdd;
46024 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
46025 return MAdd;
46026
46027 // Try to synthesize horizontal adds from adds of shuffles.
46028 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
46029 VT == MVT::v8i32) &&
46030 Subtarget.hasSSSE3() &&
46031 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) {
46032 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46033 ArrayRef<SDValue> Ops) {
46034 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
46035 };
46036 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
46037 HADDBuilder);
46038 }
46039
46040 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
46041 // (sub Y, (sext (vXi1 X))).
46042 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
46043 // generic DAG combine without a legal type check, but adding this there
46044 // caused regressions.
46045 if (VT.isVector()) {
46046 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46047 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
46048 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
46049 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
46050 SDLoc DL(N);
46051 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
46052 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
46053 }
46054
46055 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
46056 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
46057 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
46058 SDLoc DL(N);
46059 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
46060 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
46061 }
46062 }
46063
46064 return combineAddOrSubToADCOrSBB(N, DAG);
46065}
46066
46067static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
46068 const X86Subtarget &Subtarget) {
46069 SDValue Op0 = N->getOperand(0);
46070 SDValue Op1 = N->getOperand(1);
46071 EVT VT = N->getValueType(0);
46072
46073 if (!VT.isVector())
46074 return SDValue();
46075
46076 // PSUBUS is supported starting from SSE2, but truncation for v8i32
46077 // is only worth it with SSSE3 (PSHUFB).
46078 EVT EltVT = VT.getVectorElementType();
46079 if (!(Subtarget.hasSSE2() && (EltVT == MVT::i8 || EltVT == MVT::i16)) &&
46080 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
46081 !(Subtarget.useBWIRegs() && (VT == MVT::v16i32)))
46082 return SDValue();
46083
46084 SDValue SubusLHS, SubusRHS;
46085 // Try to find umax(a,b) - b or a - umin(a,b) patterns, as
46086 // they may be converted to subus(a,b).
46087 // TODO: Need to add IR canonicalization for this code.
46088 if (Op0.getOpcode() == ISD::UMAX) {
46089 SubusRHS = Op1;
46090 SDValue MaxLHS = Op0.getOperand(0);
46091 SDValue MaxRHS = Op0.getOperand(1);
46092 if (MaxLHS == Op1)
46093 SubusLHS = MaxRHS;
46094 else if (MaxRHS == Op1)
46095 SubusLHS = MaxLHS;
46096 else
46097 return SDValue();
46098 } else if (Op1.getOpcode() == ISD::UMIN) {
46099 SubusLHS = Op0;
46100 SDValue MinLHS = Op1.getOperand(0);
46101 SDValue MinRHS = Op1.getOperand(1);
46102 if (MinLHS == Op0)
46103 SubusRHS = MinRHS;
46104 else if (MinRHS == Op0)
46105 SubusRHS = MinLHS;
46106 else
46107 return SDValue();
46108 } else
46109 return SDValue();
46110
46111 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
46112 // special preprocessing in some cases.
46113 if (EltVT == MVT::i8 || EltVT == MVT::i16)
46114 return DAG.getNode(ISD::USUBSAT, SDLoc(N), VT, SubusLHS, SubusRHS);
46115
46116 assert((VT == MVT::v8i32 || VT == MVT::v16i32 || VT == MVT::v8i64) &&
46117        "Unexpected VT!");
46118
46119 // This special preprocessing can only be applied if the value was
46120 // zero-extended from 16 bits, so we require the upper 16 bits to be
46121 // zero for 32-bit values, or the upper 48 bits to be zero for 64-bit
46122 // values.
46123 KnownBits Known = DAG.computeKnownBits(SubusLHS);
46124 unsigned NumZeros = Known.countMinLeadingZeros();
46125 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
46126 return SDValue();
46127
46128 EVT ExtType = SubusLHS.getValueType();
46129 EVT ShrinkedType;
46130 if (VT == MVT::v8i32 || VT == MVT::v8i64)
46131 ShrinkedType = MVT::v8i16;
46132 else
46133 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
46134
46135 // If SubusLHS is zero-extended, truncate SubusRHS to its size:
46136 // SubusRHS = umin(0xFFF.., SubusRHS).
46137 SDValue SaturationConst =
46138 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
46139 ShrinkedType.getScalarSizeInBits()),
46140 SDLoc(SubusLHS), ExtType);
46141 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
46142 SaturationConst);
46143 SDValue NewSubusLHS =
46144 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
46145 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
46146 SDValue Psubus = DAG.getNode(ISD::USUBSAT, SDLoc(N), ShrinkedType,
46147 NewSubusLHS, NewSubusRHS);
46148
46149 // Zero-extend the result; it may be used somewhere as a 32-bit value.
46150 // If not, the zext and the following trunc will be shrunk away.
46151 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
46152}
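
The umax/umin rewrites above rest on the identities umax(a,b) - b == usubsat(a,b) and a - umin(a,b) == usubsat(a,b). An exhaustive standalone check for 8-bit lanes (UsubSat is an illustrative model of PSUBUS):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Unsigned saturating subtract for one u8 lane (the PSUBUS semantics).
static uint8_t UsubSat(uint8_t A, uint8_t B) {
  return A > B ? static_cast<uint8_t>(A - B) : 0;
}

int main() {
  for (unsigned A = 0; A != 256; ++A) {
    for (unsigned B = 0; B != 256; ++B) {
      uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
      // umax(a,b) - b  and  a - umin(a,b)  both equal usubsat(a,b).
      assert(static_cast<uint8_t>(std::max(a, b) - b) == UsubSat(a, b));
      assert(static_cast<uint8_t>(a - std::min(a, b)) == UsubSat(a, b));
    }
  }
  return 0;
}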
46153
46154static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
46155 TargetLowering::DAGCombinerInfo &DCI,
46156 const X86Subtarget &Subtarget) {
46157 SDValue Op0 = N->getOperand(0);
46158 SDValue Op1 = N->getOperand(1);
46159
46160 // X86 can't encode an immediate LHS of a sub. See if we can push the
46161 // negation into a preceding instruction.
46162 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
46163 // If the RHS of the sub is a XOR with one use and a constant, invert the
46164 // immediate. Then add one to the LHS of the sub so we can turn
46165 // X-Y -> X+~Y+1, saving one register.
46166 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
46167 isa<ConstantSDNode>(Op1.getOperand(1))) {
46168 const APInt &XorC = Op1.getConstantOperandAPInt(1);
46169 EVT VT = Op0.getValueType();
46170 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
46171 Op1.getOperand(0),
46172 DAG.getConstant(~XorC, SDLoc(Op1), VT));
46173 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
46174 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
46175 }
46176 }
46177
46178 // Try to synthesize horizontal subs from subs of shuffles.
46179 EVT VT = N->getValueType(0);
46180 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
46181 VT == MVT::v8i32) &&
46182 Subtarget.hasSSSE3() &&
46183 isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) {
46184 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46185 ArrayRef<SDValue> Ops) {
46186 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
46187 };
46188 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
46189 HSUBBuilder);
46190 }
46191
46192 // Try to create PSUBUS if SUB's argument is max/min
46193 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
46194 return V;
46195
46196 return combineAddOrSubToADCOrSBB(N, DAG);
46197}
46198
46199static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
46200 const X86Subtarget &Subtarget) {
46201 MVT VT = N->getSimpleValueType(0);
46202 SDLoc DL(N);
46203
46204 if (N->getOperand(0) == N->getOperand(1)) {
46205 if (N->getOpcode() == X86ISD::PCMPEQ)
46206 return DAG.getConstant(-1, DL, VT);
46207 if (N->getOpcode() == X86ISD::PCMPGT)
46208 return DAG.getConstant(0, DL, VT);
46209 }
46210
46211 return SDValue();
46212}
46213
46214/// Helper that combines an array of subvector ops as if they were the operands
46215/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
46216/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
46217static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
46218 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
46219 TargetLowering::DAGCombinerInfo &DCI,
46220 const X86Subtarget &Subtarget) {
46221 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
46222 unsigned EltSizeInBits = VT.getScalarSizeInBits();
46223
46224 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
46225 return DAG.getUNDEF(VT);
46226
46227 if (llvm::all_of(Ops, [](SDValue Op) {
46228 return ISD::isBuildVectorAllZeros(Op.getNode());
46229 }))
46230 return getZeroVector(VT, Subtarget, DAG, DL);
46231
46232 SDValue Op0 = Ops[0];
46233 bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
46234
46235 // Fold subvector loads into one.
46236 // If needed, look through bitcasts to get to the load.
46237 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
46238 bool Fast;
46239 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
46240 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
46241 *FirstLd->getMemOperand(), &Fast) &&
46242 Fast) {
46243 if (SDValue Ld =
46244 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
46245 return Ld;
46246 }
46247 }
46248
46249 // Repeated subvectors.
46250 if (IsSplat) {
46251 // If this broadcast/subv_broadcast is inserted into both halves, use a
46252 // larger broadcast/subv_broadcast.
46253 if (Op0.getOpcode() == X86ISD::VBROADCAST ||
46254 Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
46255 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
46256
46257 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
46258 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
46259 (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
46260 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
46261 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
46262 Op0.getOperand(0),
46263 DAG.getIntPtrConstant(0, DL)));
46264
46265 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
46266 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
46267 (Subtarget.hasAVX2() ||
46268 (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
46269 Op0.getOperand(0).getValueType() == VT.getScalarType())
46270 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
46271 }
46272
46273 // Repeated opcode.
46274 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
46275 // but it currently struggles with different vector widths.
46276 if (llvm::all_of(Ops, [Op0](SDValue Op) {
46277 return Op.getOpcode() == Op0.getOpcode();
46278 })) {
46279 unsigned NumOps = Ops.size();
46280 switch (Op0.getOpcode()) {
46281 case X86ISD::PSHUFHW:
46282 case X86ISD::PSHUFLW:
46283 case X86ISD::PSHUFD:
46284 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
46285 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
46286 SmallVector<SDValue, 2> Src;
46287 for (unsigned i = 0; i != NumOps; ++i)
46288 Src.push_back(Ops[i].getOperand(0));
46289 return DAG.getNode(Op0.getOpcode(), DL, VT,
46290 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
46291 Op0.getOperand(1));
46292 }
46293      LLVM_FALLTHROUGH;
46294 case X86ISD::VPERMILPI:
46295 // TODO - add support for vXf64/vXi64 shuffles.
46296 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
46297 Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
46298 SmallVector<SDValue, 2> Src;
46299 for (unsigned i = 0; i != NumOps; ++i)
46300 Src.push_back(DAG.getBitcast(MVT::v4f32, Ops[i].getOperand(0)));
46301 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f32, Src);
46302 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
46303 Op0.getOperand(1));
46304 return DAG.getBitcast(VT, Res);
46305 }
46306 break;
46307 case X86ISD::VSHLI:
46308 case X86ISD::VSRAI:
46309 case X86ISD::VSRLI:
46310 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
46311 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
46312 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
46313 llvm::all_of(Ops, [Op0](SDValue Op) {
46314 return Op0.getOperand(1) == Op.getOperand(1);
46315 })) {
46316 SmallVector<SDValue, 2> Src;
46317 for (unsigned i = 0; i != NumOps; ++i)
46318 Src.push_back(Ops[i].getOperand(0));
46319 return DAG.getNode(Op0.getOpcode(), DL, VT,
46320 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
46321 Op0.getOperand(1));
46322 }
46323 break;
46324 case X86ISD::VPERMI:
46325 case X86ISD::VROTLI:
46326 case X86ISD::VROTRI:
46327 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
46328 llvm::all_of(Ops, [Op0](SDValue Op) {
46329 return Op0.getOperand(1) == Op.getOperand(1);
46330 })) {
46331 SmallVector<SDValue, 2> Src;
46332 for (unsigned i = 0; i != NumOps; ++i)
46333 Src.push_back(Ops[i].getOperand(0));
46334 return DAG.getNode(Op0.getOpcode(), DL, VT,
46335 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
46336 Op0.getOperand(1));
46337 }
46338 break;
46339 case X86ISD::PACKSS:
46340 case X86ISD::PACKUS:
46341 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
46342 Subtarget.hasInt256()) {
46343 SmallVector<SDValue, 2> LHS, RHS;
46344 for (unsigned i = 0; i != NumOps; ++i) {
46345 LHS.push_back(Ops[i].getOperand(0));
46346 RHS.push_back(Ops[i].getOperand(1));
46347 }
46348 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
46349 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
46350 NumOps * SrcVT.getVectorNumElements());
46351 return DAG.getNode(Op0.getOpcode(), DL, VT,
46352 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, LHS),
46353 DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
46354 }
46355 break;
46356 }
46357 }
46358
46359 return SDValue();
46360}
46361
46362static SDValue combineConcatVectors(SDNode *N, SelectionDAG &DAG,
46363 TargetLowering::DAGCombinerInfo &DCI,
46364 const X86Subtarget &Subtarget) {
46365 EVT VT = N->getValueType(0);
46366 EVT SrcVT = N->getOperand(0).getValueType();
46367 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46368
46369 // Don't do anything for i1 vectors.
46370 if (VT.getVectorElementType() == MVT::i1)
46371 return SDValue();
46372
46373 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
46374 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
46375 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
46376 DCI, Subtarget))
46377 return R;
46378 }
46379
46380 return SDValue();
46381}
46382
46383static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
46384 TargetLowering::DAGCombinerInfo &DCI,
46385 const X86Subtarget &Subtarget) {
46386 if (DCI.isBeforeLegalizeOps())
46387 return SDValue();
46388
46389 MVT OpVT = N->getSimpleValueType(0);
46390
46391 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
46392
46393 SDLoc dl(N);
46394 SDValue Vec = N->getOperand(0);
46395 SDValue SubVec = N->getOperand(1);
46396
46397 uint64_t IdxVal = N->getConstantOperandVal(2);
46398 MVT SubVecVT = SubVec.getSimpleValueType();
46399
46400 if (Vec.isUndef() && SubVec.isUndef())
46401 return DAG.getUNDEF(OpVT);
46402
46403 // Inserting undefs/zeros into zeros/undefs is a zero vector.
46404 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
46405 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
46406 return getZeroVector(OpVT, Subtarget, DAG, dl);
46407
46408 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
46409 // If we're inserting into a zero vector and then into a larger zero vector,
46410 // just insert into the larger zero vector directly.
46411 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
46412 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
46413 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
46414 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
46415 getZeroVector(OpVT, Subtarget, DAG, dl),
46416 SubVec.getOperand(1),
46417 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
46418 }
46419
46420 // If we're inserting into a zero vector and our input was extracted from an
46421    // insert into a zero vector of the same type, and the extraction was at
46422    // least as large as the original insertion, just insert the original
46423 // subvector into a zero vector.
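    // Illustrative pattern:
    //   insert_subvector(zeros, extract_subvector(insert_subvector(zeros, X, 0), 0), 0)
    //     --> insert_subvector(zeros, X, 0)
    // as long as X is no wider than the extracted subvector.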
46424 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
46425 isNullConstant(SubVec.getOperand(1)) &&
46426 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
46427 SDValue Ins = SubVec.getOperand(0);
46428 if (isNullConstant(Ins.getOperand(2)) &&
46429 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
46430 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
46431 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
46432 getZeroVector(OpVT, Subtarget, DAG, dl),
46433 Ins.getOperand(1), N->getOperand(2));
46434 }
46435 }
46436
46437 // Stop here if this is an i1 vector.
46438 if (IsI1Vector)
46439 return SDValue();
46440
46441 // If this is an insert of an extract, combine to a shuffle. Don't do this
46442 // if the insert or extract can be represented with a subregister operation.
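  // For example, insert_subvector(v8i32 A, extract_subvector(v8i32 B, 4), 0) becomes
  // shuffle(A, B, <12,13,14,15,4,5,6,7>): B's upper half is moved into A's lower
  // half with a single shuffle.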
46443 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46444 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
46445 (IdxVal != 0 || !Vec.isUndef())) {
46446 int ExtIdxVal = SubVec.getConstantOperandVal(1);
46447 if (ExtIdxVal != 0) {
46448 int VecNumElts = OpVT.getVectorNumElements();
46449 int SubVecNumElts = SubVecVT.getVectorNumElements();
46450 SmallVector<int, 64> Mask(VecNumElts);
46451 // First create an identity shuffle mask.
46452 for (int i = 0; i != VecNumElts; ++i)
46453 Mask[i] = i;
46454 // Now insert the extracted portion.
46455 for (int i = 0; i != SubVecNumElts; ++i)
46456 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
46457
46458 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
46459 }
46460 }
46461
46462 // Match concat_vector style patterns.
46463 SmallVector<SDValue, 2> SubVectorOps;
46464 if (collectConcatOps(N, SubVectorOps)) {
46465 if (SDValue Fold =
46466 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
46467 return Fold;
46468
46469 // If we're inserting all zeros into the upper half, change this to
46470 // a concat with zero. We will match this to a move
46471 // with implicit upper bit zeroing during isel.
46472 // We do this here because we don't want combineConcatVectorOps to
46473 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
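    // For example, when the collected halves are { Lo, zeros } we emit
    // insert_subvector(zero-vector, Lo, 0), which isel can select as a VEX-encoded
    // 128-bit move that implicitly zeroes the upper lanes.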
46474 if (SubVectorOps.size() == 2 &&
46475 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
46476 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
46477 getZeroVector(OpVT, Subtarget, DAG, dl),
46478 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
46479 }
46480
46481 // If this is a broadcast insert into an upper undef, use a larger broadcast.
46482 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
46483 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
46484
46485 // If this is a broadcast load inserted into an upper undef, use a larger
46486 // broadcast load.
46487 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
46488 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
46489 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
46490 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
46491 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
46492 SDValue BcastLd =
46493 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
46494 MemIntr->getMemoryVT(),
46495 MemIntr->getMemOperand());
46496 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
46497 return BcastLd;
46498 }
46499
46500 return SDValue();
46501}
46502
46503/// If we are extracting a subvector of a vector select and the select condition
46504/// is composed of concatenated vectors, try to narrow the select width. This
46505/// is a common pattern for AVX1 integer code because 256-bit selects may be
46506/// legal, but there is almost no integer math/logic available for 256-bit.
46507/// This function should only be called with legal types (otherwise, the calls
46508/// to get simple value types will assert).
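/// Illustrative example: extracting the low 128 bits of a 256-bit
/// vselect(concat_vectors(C0, C1), T, F) becomes
/// vselect(C0, extract_subvector(T, 0), extract_subvector(F, 0)), so the whole
/// select is performed in 128 bits.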
46509static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
46510 SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
46511 SmallVector<SDValue, 4> CatOps;
46512 if (Sel.getOpcode() != ISD::VSELECT ||
46513 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
46514 return SDValue();
46515
46516 // Note: We assume simple value types because this should only be called with
46517 // legal operations/types.
46518 // TODO: This can be extended to handle extraction to 256-bits.
46519 MVT VT = Ext->getSimpleValueType(0);
46520 if (!VT.is128BitVector())
46521 return SDValue();
46522
46523 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
46524 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
46525 return SDValue();
46526
46527 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
46528 MVT SelVT = Sel.getSimpleValueType();
46529  assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
46530         "Unexpected vector type with legal operations");
46531
46532 unsigned SelElts = SelVT.getVectorNumElements();
46533 unsigned CastedElts = WideVT.getVectorNumElements();
46534 unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
46535 if (SelElts % CastedElts == 0) {
46536 // The select has the same or more (narrower) elements than the extract
46537 // operand. The extraction index gets scaled by that factor.
46538 ExtIdx *= (SelElts / CastedElts);
46539 } else if (CastedElts % SelElts == 0) {
46540    // The select has fewer (wider) elements than the extract operand. Make sure
46541 // that the extraction index can be divided evenly.
46542 unsigned IndexDivisor = CastedElts / SelElts;
46543 if (ExtIdx % IndexDivisor != 0)
46544 return SDValue();
46545 ExtIdx /= IndexDivisor;
46546 } else {
46547    llvm_unreachable("Element count of simple vector types are not divisible?");
46548 }
46549
46550 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
46551 unsigned NarrowElts = SelElts / NarrowingFactor;
46552 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
46553 SDLoc DL(Ext);
46554 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
46555 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
46556 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
46557 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
46558 return DAG.getBitcast(VT, NarrowSel);
46559}
46560
46561static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
46562 TargetLowering::DAGCombinerInfo &DCI,
46563 const X86Subtarget &Subtarget) {
46564 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
46565 // eventually get combined/lowered into ANDNP) with a concatenated operand,
46566 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
46567 // We let generic combining take over from there to simplify the
46568 // insert/extract and 'not'.
46569 // This pattern emerges during AVX1 legalization. We handle it before lowering
46570 // to avoid complications like splitting constant vector loads.
46571
46572 // Capture the original wide type in the likely case that we need to bitcast
46573 // back to this type.
46574 if (!N->getValueType(0).isSimple())
46575 return SDValue();
46576
46577 MVT VT = N->getSimpleValueType(0);
46578 SDValue InVec = N->getOperand(0);
46579 SDValue InVecBC = peekThroughBitcasts(InVec);
46580 EVT InVecVT = InVec.getValueType();
46581 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46582
46583 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
46584 TLI.isTypeLegal(InVecVT) &&
46585 InVecVT.getSizeInBits() == 256 && InVecBC.getOpcode() == ISD::AND) {
46586 auto isConcatenatedNot = [] (SDValue V) {
46587 V = peekThroughBitcasts(V);
46588 if (!isBitwiseNot(V))
46589 return false;
46590 SDValue NotOp = V->getOperand(0);
46591 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
46592 };
46593 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
46594 isConcatenatedNot(InVecBC.getOperand(1))) {
46595 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
46596 SDValue Concat = split256IntArith(InVecBC, DAG);
46597 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
46598 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
46599 }
46600 }
46601
46602 if (DCI.isBeforeLegalizeOps())
46603 return SDValue();
46604
46605 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
46606 return V;
46607
46608 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
46609
46610 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
46611 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
46612
46613 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
46614 if (VT.getScalarType() == MVT::i1)
46615 return DAG.getConstant(1, SDLoc(N), VT);
46616 return getOnesVector(VT, DAG, SDLoc(N));
46617 }
46618
46619 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
46620 return DAG.getBuildVector(
46621 VT, SDLoc(N),
46622 InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
46623
46624 // If we are extracting from an insert into a zero vector, replace with a
46625  // smaller insert into zero as long as we access at least as much as the
46626  // originally inserted subvector. Don't do this for i1 vectors.
46627 if (VT.getVectorElementType() != MVT::i1 &&
46628 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
46629 InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
46630 ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
46631 InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
46632 SDLoc DL(N);
46633 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
46634 getZeroVector(VT, Subtarget, DAG, DL),
46635 InVec.getOperand(1), InVec.getOperand(2));
46636 }
46637
46638 // If we're extracting from a broadcast then we're better off just
46639 // broadcasting to the smaller type directly, assuming this is the only use.
46640  // As it's a broadcast we don't care about the extraction index.
46641 if (InVec.getOpcode() == X86ISD::VBROADCAST && InVec.hasOneUse() &&
46642 InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
46643 return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
46644
46645 if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
46646 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
46647 if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
46648 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
46649 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
46650 SDValue BcastLd =
46651 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
46652 MemIntr->getMemoryVT(),
46653 MemIntr->getMemOperand());
46654 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
46655 return BcastLd;
46656 }
46657 }
46658
46659 // If we're extracting the lowest subvector and we're the only user,
46660 // we may be able to perform this with a smaller vector width.
46661 if (IdxVal == 0 && InVec.hasOneUse()) {
46662 unsigned InOpcode = InVec.getOpcode();
46663 if (VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
46664 // v2f64 CVTDQ2PD(v4i32).
46665 if (InOpcode == ISD::SINT_TO_FP &&
46666 InVec.getOperand(0).getValueType() == MVT::v4i32) {
46667 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
46668 }
46669 // v2f64 CVTUDQ2PD(v4i32).
46670 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
46671 InVec.getOperand(0).getValueType() == MVT::v4i32) {
46672 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
46673 }
46674 // v2f64 CVTPS2PD(v4f32).
46675 if (InOpcode == ISD::FP_EXTEND &&
46676 InVec.getOperand(0).getValueType() == MVT::v4f32) {
46677 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
46678 }
46679 }
46680 if ((InOpcode == ISD::ANY_EXTEND ||
46681 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
46682 InOpcode == ISD::ZERO_EXTEND ||
46683 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
46684 InOpcode == ISD::SIGN_EXTEND ||
46685 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
46686 VT.is128BitVector() &&
46687 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
46688 unsigned ExtOp = getOpcode_EXTEND_VECTOR_INREG(InOpcode);
46689 return DAG.getNode(ExtOp, SDLoc(N), VT, InVec.getOperand(0));
46690 }
46691 if (InOpcode == ISD::VSELECT &&
46692 InVec.getOperand(0).getValueType().is256BitVector() &&
46693 InVec.getOperand(1).getValueType().is256BitVector() &&
46694 InVec.getOperand(2).getValueType().is256BitVector()) {
46695 SDLoc DL(N);
46696 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
46697 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
46698 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
46699 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
46700 }
46701 }
46702
46703 return SDValue();
46704}
46705
46706static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
46707 EVT VT = N->getValueType(0);
46708 SDValue Src = N->getOperand(0);
46709 SDLoc DL(N);
46710
46711  // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
46712 // This occurs frequently in our masked scalar intrinsic code and our
46713 // floating point select lowering with AVX512.
46714 // TODO: SimplifyDemandedBits instead?
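  // For example, scalar_to_vector(v1i1 (and i8 %mask, 1)) becomes
  // scalar_to_vector(v1i1 %mask); only bit 0 is meaningful in a v1i1 anyway.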
46715 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
46716 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
46717 if (C->getAPIntValue().isOneValue())
46718 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
46719 Src.getOperand(0));
46720
46721 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
46722 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46723 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
46724 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
46725 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
46726 if (C->isNullValue())
46727 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
46728 Src.getOperand(1));
46729
46730 // Reduce v2i64 to v4i32 if we don't need the upper bits.
46731 // TODO: Move to DAGCombine?
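  // For example, scalar_to_vector(v2i64 (any_extend i32 %x to i64)) is rebuilt as
  // bitcast(v2i64, scalar_to_vector(v4i32 %x)), avoiding the 64-bit GPR extension
  // before the value is moved into a vector register.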
46732 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
46733 Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
46734 Src.getOperand(0).getScalarValueSizeInBits() <= 32)
46735 return DAG.getBitcast(
46736 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
46737 DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
46738
46739 return SDValue();
46740}
46741
46742// Simplify PMULDQ and PMULUDQ operations.
46743static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
46744 TargetLowering::DAGCombinerInfo &DCI,
46745 const X86Subtarget &Subtarget) {
46746 SDValue LHS = N->getOperand(0);
46747 SDValue RHS = N->getOperand(1);
46748
46749 // Canonicalize constant to RHS.
46750 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
46751 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
46752 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
46753
46754 // Multiply by zero.
46755 // Don't return RHS as it may contain UNDEFs.
46756 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
46757 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
46758
46759 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
46760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46761 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
46762 return SDValue(N, 0);
46763
46764 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
46765 // convert it to any_extend_invec, due to the LegalOperations check, do the
46766 // conversion directly to a vector shuffle manually. This exposes combine
46767 // opportunities missed by combineExtInVec not calling
46768 // combineX86ShufflesRecursively on SSE4.1 targets.
46769 // FIXME: This is basically a hack around several other issues related to
46770 // ANY_EXTEND_VECTOR_INREG.
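  // For example, pmuludq(v2i64 (zero_extend_vector_inreg v4i32 X), Y) becomes
  // pmuludq(bitcast(v2i64, shuffle(X, X, <0,-1,1,-1>)), Y); the undef odd lanes are
  // harmless because PMULUDQ/PMULDQ only read the low 32 bits of each element.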
46771 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
46772 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
46773 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
46774 LHS.getOperand(0).getValueType() == MVT::v4i32) {
46775 SDLoc dl(N);
46776 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
46777 LHS.getOperand(0), { 0, -1, 1, -1 });
46778 LHS = DAG.getBitcast(MVT::v2i64, LHS);
46779 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
46780 }
46781 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
46782 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
46783 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
46784 RHS.getOperand(0).getValueType() == MVT::v4i32) {
46785 SDLoc dl(N);
46786 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
46787 RHS.getOperand(0), { 0, -1, 1, -1 });
46788 RHS = DAG.getBitcast(MVT::v2i64, RHS);
46789 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
46790 }
46791
46792 return SDValue();
46793}
46794
46795static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
46796 TargetLowering::DAGCombinerInfo &DCI,
46797 const X86Subtarget &Subtarget) {
46798 EVT VT = N->getValueType(0);
46799 SDValue In = N->getOperand(0);
46800 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46801
46802 // Try to merge vector loads and extend_inreg to an extload.
46803 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
46804 In.hasOneUse()) {
46805 auto *Ld = cast<LoadSDNode>(In);
46806 if (Ld->isSimple()) {
46807 MVT SVT = In.getSimpleValueType().getVectorElementType();
46808 ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
46809 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
46810 VT.getVectorNumElements());
46811 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
46812 SDValue Load =
46813 DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
46814 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
46815 Ld->getMemOperand()->getFlags());
46816 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
46817 return Load;
46818 }
46819 }
46820 }
46821
46822 // Attempt to combine as a shuffle.
46823 // TODO: SSE41 support
46824 if (Subtarget.hasAVX() && N->getOpcode() != ISD::SIGN_EXTEND_VECTOR_INREG) {
46825 SDValue Op(N, 0);
46826 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
46827 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
46828 return Res;
46829 }
46830
46831 return SDValue();
46832}
46833
46834static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
46835 TargetLowering::DAGCombinerInfo &DCI) {
46836 EVT VT = N->getValueType(0);
46837
46838 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
46839 return DAG.getConstant(0, SDLoc(N), VT);
46840
46841 APInt KnownUndef, KnownZero;
46842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46843 APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
46844 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
46845 KnownZero, DCI))
46846 return SDValue(N, 0);
46847
46848 return SDValue();
46849}
46850
46851// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
46852// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
46853// extra instructions between the conversions due to going to scalar and back.
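// Illustrative result: (f32 fp16_to_fp(fp_to_fp16 %x)) becomes
// extract_vector_elt(cvtph2ps(cvtps2ph(scalar_to_vector(%x), 4)), 0), keeping the
// value in an XMM register for the whole round trip.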
46854static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
46855 const X86Subtarget &Subtarget) {
46856 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
46857 return SDValue();
46858
46859 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
46860 return SDValue();
46861
46862 if (N->getValueType(0) != MVT::f32 ||
46863 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
46864 return SDValue();
46865
46866 SDLoc dl(N);
46867 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
46868 N->getOperand(0).getOperand(0));
46869 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
46870 DAG.getTargetConstant(4, dl, MVT::i32));
46871 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
46872 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
46873 DAG.getIntPtrConstant(0, dl));
46874}
46875
46876static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
46877 const X86Subtarget &Subtarget) {
46878 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
46879 return SDValue();
46880
46881 bool IsStrict = N->isStrictFPOpcode();
46882 EVT VT = N->getValueType(0);
46883 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
46884 EVT SrcVT = Src.getValueType();
46885
46886 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
46887 return SDValue();
46888
46889 if (VT.getVectorElementType() != MVT::f32 &&
46890 VT.getVectorElementType() != MVT::f64)
46891 return SDValue();
46892
46893 unsigned NumElts = VT.getVectorNumElements();
46894 if (NumElts == 1 || !isPowerOf2_32(NumElts))
46895 return SDValue();
46896
46897 SDLoc dl(N);
46898
46899 // Convert the input to vXi16.
46900 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
46901 Src = DAG.getBitcast(IntVT, Src);
46902
46903 // Widen to at least 8 input elements.
46904 if (NumElts < 8) {
46905 unsigned NumConcats = 8 / NumElts;
46906 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
46907 : DAG.getConstant(0, dl, IntVT);
46908 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
46909 Ops[0] = Src;
46910 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
46911 }
46912
46913 // Destination is vXf32 with at least 4 elements.
46914 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
46915 std::max(4U, NumElts));
46916 SDValue Cvt, Chain;
46917 if (IsStrict) {
46918 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
46919 {N->getOperand(0), Src});
46920 Chain = Cvt.getValue(1);
46921 } else {
46922 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
46923 }
46924
46925 if (NumElts < 4) {
46926    assert(NumElts == 2 && "Unexpected size");
46927 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
46928 DAG.getIntPtrConstant(0, dl));
46929 }
46930
46931 if (IsStrict) {
46932 // Extend to the original VT if necessary.
46933 if (Cvt.getValueType() != VT) {
46934 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
46935 {Chain, Cvt});
46936 Chain = Cvt.getValue(1);
46937 }
46938 return DAG.getMergeValues({Cvt, Chain}, dl);
46939 }
46940
46941 // Extend to the original VT if necessary.
46942 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
46943}
46944
46945// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
46946// cases where the loads have the same input chain and the output chains are
46947// unused. This avoids any memory ordering issues.
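// Illustrative example: a v4f32 VBROADCAST_LOAD of a float from %p can instead take
// the low 128 bits of an existing v8f32 VBROADCAST_LOAD of the same float on the
// same chain, removing the duplicate load.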
46948static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
46949 TargetLowering::DAGCombinerInfo &DCI) {
46950 // Only do this if the chain result is unused.
46951 if (N->hasAnyUseOfValue(1))
46952 return SDValue();
46953
46954 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
46955
46956 SDValue Ptr = MemIntrin->getBasePtr();
46957 SDValue Chain = MemIntrin->getChain();
46958 EVT VT = N->getSimpleValueType(0);
46959 EVT MemVT = MemIntrin->getMemoryVT();
46960
46961 // Look at other users of our base pointer and try to find a wider broadcast.
46962 // The input chain and the size of the memory VT must match.
46963 for (SDNode *User : Ptr->uses())
46964 if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
46965 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
46966 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
46967 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
46968 MemVT.getSizeInBits() &&
46969 !User->hasAnyUseOfValue(1) &&
46970 User->getValueSizeInBits(0) > VT.getSizeInBits()) {
46971 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
46972 VT.getSizeInBits());
46973 Extract = DAG.getBitcast(VT, Extract);
46974 return DCI.CombineTo(N, Extract, SDValue(User, 1));
46975 }
46976
46977 return SDValue();
46978}
46979
46980static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
46981 const X86Subtarget &Subtarget) {
46982 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
46983 return SDValue();
46984
46985 EVT VT = N->getValueType(0);
46986 SDValue Src = N->getOperand(0);
46987 EVT SrcVT = Src.getValueType();
46988
46989 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
46990 SrcVT.getVectorElementType() != MVT::f32)
46991 return SDValue();
46992
46993 unsigned NumElts = VT.getVectorNumElements();
46994 if (NumElts == 1 || !isPowerOf2_32(NumElts))
46995 return SDValue();
46996
46997 SDLoc dl(N);
46998
46999 // Widen to at least 4 input elements.
47000 if (NumElts < 4)
47001 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
47002 DAG.getConstantFP(0.0, dl, SrcVT));
47003
47004 // Destination is v8i16 with at least 8 elements.
47005 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
47006 std::max(8U, NumElts));
47007 SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
47008 DAG.getTargetConstant(4, dl, MVT::i32));
47009
47010 // Extract down to real number of elements.
47011 if (NumElts < 8) {
47012 EVT IntVT = VT.changeVectorElementTypeToInteger();
47013 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
47014 DAG.getIntPtrConstant(0, dl));
47015 }
47016
47017 return DAG.getBitcast(VT, Cvt);
47018}
47019
47020SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
47021 DAGCombinerInfo &DCI) const {
47022 SelectionDAG &DAG = DCI.DAG;
47023 switch (N->getOpcode()) {
47024 default: break;
47025 case ISD::SCALAR_TO_VECTOR:
47026 return combineScalarToVector(N, DAG);
47027 case ISD::EXTRACT_VECTOR_ELT:
47028 case X86ISD::PEXTRW:
47029 case X86ISD::PEXTRB:
47030 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
47031 case ISD::CONCAT_VECTORS:
47032 return combineConcatVectors(N, DAG, DCI, Subtarget);
47033 case ISD::INSERT_SUBVECTOR:
47034 return combineInsertSubvector(N, DAG, DCI, Subtarget);
47035 case ISD::EXTRACT_SUBVECTOR:
47036 return combineExtractSubvector(N, DAG, DCI, Subtarget);
47037 case ISD::VSELECT:
47038 case ISD::SELECT:
47039 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
47040 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
47041 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
47042 case X86ISD::CMP: return combineCMP(N, DAG);
47043 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
47044 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
47045 case X86ISD::ADD:
47046 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
47047 case X86ISD::SBB: return combineSBB(N, DAG);
47048 case X86ISD::ADC: return combineADC(N, DAG, DCI);
47049 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
47050 case ISD::SHL: return combineShiftLeft(N, DAG);
47051 case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
47052 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
47053 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
47054 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
47055 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
47056 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
47057 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
47058 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
47059 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
47060 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
47061 case X86ISD::VEXTRACT_STORE:
47062 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
47063 case ISD::SINT_TO_FP:
47064 case ISD::STRICT_SINT_TO_FP:
47065 return combineSIntToFP(N, DAG, DCI, Subtarget);
47066 case ISD::UINT_TO_FP:
47067 case ISD::STRICT_UINT_TO_FP:
47068 return combineUIntToFP(N, DAG, Subtarget);
47069 case ISD::FADD:
47070 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
47071 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
47072 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
47073 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
47074 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
47075 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
47076 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
47077 case X86ISD::FXOR:
47078 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
47079 case X86ISD::FMIN:
47080 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
47081 case ISD::FMINNUM:
47082 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
47083 case X86ISD::CVTSI2P:
47084 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
47085 case X86ISD::CVTP2SI:
47086 case X86ISD::CVTP2UI:
47087 case X86ISD::CVTTP2SI:
47088 case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
47089 case X86ISD::STRICT_CVTPH2PS:
47090 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
47091 case X86ISD::BT: return combineBT(N, DAG, DCI);
47092 case ISD::ANY_EXTEND:
47093 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
47094 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
47095 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
47096 case ISD::ANY_EXTEND_VECTOR_INREG:
47097 case ISD::SIGN_EXTEND_VECTOR_INREG:
47098 case ISD::ZERO_EXTEND_VECTOR_INREG: return combineExtInVec(N, DAG, DCI,
47099 Subtarget);
47100 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
47101 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
47102 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
47103 case X86ISD::PACKSS:
47104 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
47105 case X86ISD::VSHL:
47106 case X86ISD::VSRA:
47107 case X86ISD::VSRL:
47108 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
47109 case X86ISD::VSHLI:
47110 case X86ISD::VSRAI:
47111 case X86ISD::VSRLI:
47112 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
47113 case X86ISD::PINSRB:
47114 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
47115 case X86ISD::SHUFP: // Handle all target specific shuffles
47116 case X86ISD::INSERTPS:
47117 case X86ISD::EXTRQI:
47118 case X86ISD::INSERTQI:
47119 case X86ISD::PALIGNR:
47120 case X86ISD::VSHLDQ:
47121 case X86ISD::VSRLDQ:
47122 case X86ISD::BLENDI:
47123 case X86ISD::UNPCKH:
47124 case X86ISD::UNPCKL:
47125 case X86ISD::MOVHLPS:
47126 case X86ISD::MOVLHPS:
47127 case X86ISD::PSHUFB:
47128 case X86ISD::PSHUFD:
47129 case X86ISD::PSHUFHW:
47130 case X86ISD::PSHUFLW:
47131 case X86ISD::MOVSHDUP:
47132 case X86ISD::MOVSLDUP:
47133 case X86ISD::MOVDDUP:
47134 case X86ISD::MOVSS:
47135 case X86ISD::MOVSD:
47136 case X86ISD::VBROADCAST:
47137 case X86ISD::VPPERM:
47138 case X86ISD::VPERMI:
47139 case X86ISD::VPERMV:
47140 case X86ISD::VPERMV3:
47141 case X86ISD::VPERMIL2:
47142 case X86ISD::VPERMILPI:
47143 case X86ISD::VPERMILPV:
47144 case X86ISD::VPERM2X128:
47145 case X86ISD::SHUF128:
47146 case X86ISD::VZEXT_MOVL:
47147  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
47148 case X86ISD::FMADD_RND:
47149 case X86ISD::FMSUB:
47150 case X86ISD::STRICT_FMSUB:
47151 case X86ISD::FMSUB_RND:
47152 case X86ISD::FNMADD:
47153 case X86ISD::STRICT_FNMADD:
47154 case X86ISD::FNMADD_RND:
47155 case X86ISD::FNMSUB:
47156 case X86ISD::STRICT_FNMSUB:
47157 case X86ISD::FNMSUB_RND:
47158 case ISD::FMA:
47159 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
47160 case X86ISD::FMADDSUB_RND:
47161 case X86ISD::FMSUBADD_RND:
47162 case X86ISD::FMADDSUB:
47163 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
47164 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
47165 case X86ISD::MGATHER:
47166 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
47167 case ISD::MGATHER:
47168 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
47169 case X86ISD::PCMPEQ:
47170 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
47171 case X86ISD::PMULDQ:
47172 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
47173 case X86ISD::KSHIFTL:
47174 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
47175 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
47176 case ISD::STRICT_FP_EXTEND:
47177 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
47178 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
47179 case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
47180 }
47181
47182 return SDValue();
47183}
47184
47185bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
47186 if (!isTypeLegal(VT))
47187 return false;
47188
47189 // There are no vXi8 shifts.
47190 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
47191 return false;
47192
47193 // TODO: Almost no 8-bit ops are desirable because they have no actual
47194 // size/speed advantages vs. 32-bit ops, but they do have a major
47195 // potential disadvantage by causing partial register stalls.
47196 //
47197 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
47198 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
47199 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
47200 // check for a constant operand to the multiply.
47201 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
47202 return false;
47203
47204 // i16 instruction encodings are longer and some i16 instructions are slow,
47205 // so those are not desirable.
47206 if (VT == MVT::i16) {
47207 switch (Opc) {
47208 default:
47209 break;
47210 case ISD::LOAD:
47211 case ISD::SIGN_EXTEND:
47212 case ISD::ZERO_EXTEND:
47213 case ISD::ANY_EXTEND:
47214 case ISD::SHL:
47215 case ISD::SRA:
47216 case ISD::SRL:
47217 case ISD::SUB:
47218 case ISD::ADD:
47219 case ISD::MUL:
47220 case ISD::AND:
47221 case ISD::OR:
47222 case ISD::XOR:
47223 return false;
47224 }
47225 }
47226
47227 // Any legal type not explicitly accounted for above here is desirable.
47228 return true;
47229}
47230
47231SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
47232 SDValue Value, SDValue Addr,
47233 SelectionDAG &DAG) const {
47234 const Module *M = DAG.getMachineFunction().getMMI().getModule();
47235 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
47236 if (IsCFProtectionSupported) {
47237 // In case control-flow branch protection is enabled, we need to add
47238 // notrack prefix to the indirect branch.
47239 // In order to do that we create NT_BRIND SDNode.
47240 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
47241 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
47242 }
47243
47244 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
47245}
47246
47247bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
47248 EVT VT = Op.getValueType();
47249 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
47250 isa<ConstantSDNode>(Op.getOperand(1));
47251
47252 // i16 is legal, but undesirable since i16 instruction encodings are longer
47253 // and some i16 instructions are slow.
47254 // 8-bit multiply-by-constant can usually be expanded to something cheaper
47255 // using LEA and/or other ALU ops.
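  // Illustrative outcome (exact selection is target dependent): (mul i8 %x, 5) is
  // promoted to i32 here, where the multiply can typically be selected as an LEA.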
47256 if (VT != MVT::i16 && !Is8BitMulByConstant)
47257 return false;
47258
47259 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
47260 if (!Op.hasOneUse())
47261 return false;
47262 SDNode *User = *Op->use_begin();
47263 if (!ISD::isNormalStore(User))
47264 return false;
47265 auto *Ld = cast<LoadSDNode>(Load);
47266 auto *St = cast<StoreSDNode>(User);
47267 return Ld->getBasePtr() == St->getBasePtr();
47268 };
47269
47270 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
47271 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
47272 return false;
47273 if (!Op.hasOneUse())
47274 return false;
47275 SDNode *User = *Op->use_begin();
47276 if (User->getOpcode() != ISD::ATOMIC_STORE)
47277 return false;
47278 auto *Ld = cast<AtomicSDNode>(Load);
47279 auto *St = cast<AtomicSDNode>(User);
47280 return Ld->getBasePtr() == St->getBasePtr();
47281 };
47282
47283 bool Commute = false;
47284 switch (Op.getOpcode()) {
47285 default: return false;
47286 case ISD::SIGN_EXTEND:
47287 case ISD::ZERO_EXTEND:
47288 case ISD::ANY_EXTEND:
47289 break;
47290 case ISD::SHL:
47291 case ISD::SRA:
47292 case ISD::SRL: {
47293 SDValue N0 = Op.getOperand(0);
47294 // Look out for (store (shl (load), x)).
47295 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
47296 return false;
47297 break;
47298 }
47299 case ISD::ADD:
47300 case ISD::MUL:
47301 case ISD::AND:
47302 case ISD::OR:
47303 case ISD::XOR:
47304 Commute = true;
47305    LLVM_FALLTHROUGH;
47306 case ISD::SUB: {
47307 SDValue N0 = Op.getOperand(0);
47308 SDValue N1 = Op.getOperand(1);
47309 // Avoid disabling potential load folding opportunities.
47310 if (MayFoldLoad(N1) &&
47311 (!Commute || !isa<ConstantSDNode>(N0) ||
47312 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
47313 return false;
47314 if (MayFoldLoad(N0) &&
47315 ((Commute && !isa<ConstantSDNode>(N1)) ||
47316 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
47317 return false;
47318 if (IsFoldableAtomicRMW(N0, Op) ||
47319 (Commute && IsFoldableAtomicRMW(N1, Op)))
47320 return false;
47321 }
47322 }
47323
47324 PVT = MVT::i32;
47325 return true;
47326}
47327
47328//===----------------------------------------------------------------------===//
47329// X86 Inline Assembly Support
47330//===----------------------------------------------------------------------===//
47331
47332// Helper to match a string separated by whitespace.
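// Illustrative behaviour: matchAsm("  bswap   $0", {"bswap", "$0"}) returns true,
// while matchAsm("bswapl $0", {"bswap", "$0"}) returns false because "bswap" only
// matches a prefix of the first token.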
47333static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
47334 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
47335
47336 for (StringRef Piece : Pieces) {
47337 if (!S.startswith(Piece)) // Check if the piece matches.
47338 return false;
47339
47340 S = S.substr(Piece.size());
47341 StringRef::size_type Pos = S.find_first_not_of(" \t");
47342 if (Pos == 0) // We matched a prefix.
47343 return false;
47344
47345 S = S.substr(Pos);
47346 }
47347
47348 return S.empty();
47349}
47350
47351static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
47352
47353 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
47354 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
47355 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
47356 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
47357
47358 if (AsmPieces.size() == 3)
47359 return true;
47360 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
47361 return true;
47362 }
47363 }
47364 return false;
47365}
47366
47367bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
47368 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
47369
47370 const std::string &AsmStr = IA->getAsmString();
47371
47372 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
47373 if (!Ty || Ty->getBitWidth() % 16 != 0)
47374 return false;
47375
47376 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
47377 SmallVector<StringRef, 4> AsmPieces;
47378 SplitString(AsmStr, AsmPieces, ";\n");
47379
47380 switch (AsmPieces.size()) {
47381 default: return false;
47382 case 1:
47383 // FIXME: this should verify that we are targeting a 486 or better. If not,
47384 // we will turn this bswap into something that will be lowered to logical
47385 // ops instead of emitting the bswap asm. For now, we don't support 486 or
47386 // lower so don't worry about this.
47387 // bswap $0
47388 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
47389 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
47390 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
47391 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
47392 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
47393 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
47394 // No need to check constraints, nothing other than the equivalent of
47395 // "=r,0" would be valid here.
47396 return IntrinsicLowering::LowerToByteSwap(CI);
47397 }
47398
47399 // rorw $$8, ${0:w} --> llvm.bswap.i16
47400 if (CI->getType()->isIntegerTy(16) &&
47401 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
47402 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
47403 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
47404 AsmPieces.clear();
47405 StringRef ConstraintsStr = IA->getConstraintString();
47406 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
47407 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
47408 if (clobbersFlagRegisters(AsmPieces))
47409 return IntrinsicLowering::LowerToByteSwap(CI);
47410 }
47411 break;
47412 case 3:
47413 if (CI->getType()->isIntegerTy(32) &&
47414 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
47415 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
47416 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
47417 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
47418 AsmPieces.clear();
47419 StringRef ConstraintsStr = IA->getConstraintString();
47420 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
47421 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
47422 if (clobbersFlagRegisters(AsmPieces))
47423 return IntrinsicLowering::LowerToByteSwap(CI);
47424 }
47425
47426 if (CI->getType()->isIntegerTy(64)) {
47427 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
47428 if (Constraints.size() >= 2 &&
47429 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
47430 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
47431 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
47432 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
47433 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
47434 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
47435 return IntrinsicLowering::LowerToByteSwap(CI);
47436 }
47437 }
47438 break;
47439 }
47440 return false;
47441}
47442
47443static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
47444 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
47445 .Case("{@cca}", X86::COND_A)
47446 .Case("{@ccae}", X86::COND_AE)
47447 .Case("{@ccb}", X86::COND_B)
47448 .Case("{@ccbe}", X86::COND_BE)
47449 .Case("{@ccc}", X86::COND_B)
47450 .Case("{@cce}", X86::COND_E)
47451 .Case("{@ccz}", X86::COND_E)
47452 .Case("{@ccg}", X86::COND_G)
47453 .Case("{@ccge}", X86::COND_GE)
47454 .Case("{@ccl}", X86::COND_L)
47455 .Case("{@ccle}", X86::COND_LE)
47456 .Case("{@ccna}", X86::COND_BE)
47457 .Case("{@ccnae}", X86::COND_B)
47458 .Case("{@ccnb}", X86::COND_AE)
47459 .Case("{@ccnbe}", X86::COND_A)
47460 .Case("{@ccnc}", X86::COND_AE)
47461 .Case("{@ccne}", X86::COND_NE)
47462 .Case("{@ccnz}", X86::COND_NE)
47463 .Case("{@ccng}", X86::COND_LE)
47464 .Case("{@ccnge}", X86::COND_L)
47465 .Case("{@ccnl}", X86::COND_GE)
47466 .Case("{@ccnle}", X86::COND_G)
47467 .Case("{@ccno}", X86::COND_NO)
47468 .Case("{@ccnp}", X86::COND_P)
47469 .Case("{@ccns}", X86::COND_NS)
47470 .Case("{@cco}", X86::COND_O)
47471 .Case("{@ccp}", X86::COND_P)
47472 .Case("{@ccs}", X86::COND_S)
47473 .Default(X86::COND_INVALID);
47474 return Cond;
47475}
47476
47477/// Given a constraint letter, return the type of constraint for this target.
47478X86TargetLowering::ConstraintType
47479X86TargetLowering::getConstraintType(StringRef Constraint) const {
47480 if (Constraint.size() == 1) {
47481 switch (Constraint[0]) {
47482 case 'R':
47483 case 'q':
47484 case 'Q':
47485 case 'f':
47486 case 't':
47487 case 'u':
47488 case 'y':
47489 case 'x':
47490 case 'v':
47491 case 'Y':
47492 case 'l':
47493 case 'k': // AVX512 masking registers.
47494 return C_RegisterClass;
47495 case 'a':
47496 case 'b':
47497 case 'c':
47498 case 'd':
47499 case 'S':
47500 case 'D':
47501 case 'A':
47502 return C_Register;
47503 case 'I':
47504 case 'J':
47505 case 'K':
47506 case 'N':
47507 case 'G':
47508 case 'L':
47509 case 'M':
47510 return C_Immediate;
47511 case 'C':
47512 case 'e':
47513 case 'Z':
47514 return C_Other;
47515 default:
47516 break;
47517 }
47518 }
47519 else if (Constraint.size() == 2) {
47520 switch (Constraint[0]) {
47521 default:
47522 break;
47523 case 'Y':
47524 switch (Constraint[1]) {
47525 default:
47526 break;
47527 case 'z':
47528 case '0':
47529 return C_Register;
47530 case 'i':
47531 case 'm':
47532 case 'k':
47533 case 't':
47534 case '2':
47535 return C_RegisterClass;
47536 }
47537 }
47538 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
47539 return C_Other;
47540 return TargetLowering::getConstraintType(Constraint);
47541}
47542
47543/// Examine constraint type and operand type and determine a weight value.
47544/// This object must already have been set up with the operand type
47545/// and the current alternative constraint selected.
47546TargetLowering::ConstraintWeight
47547 X86TargetLowering::getSingleConstraintMatchWeight(
47548 AsmOperandInfo &info, const char *constraint) const {
47549 ConstraintWeight weight = CW_Invalid;
47550 Value *CallOperandVal = info.CallOperandVal;
47551 // If we don't have a value, we can't do a match,
47552 // but allow it at the lowest weight.
47553 if (!CallOperandVal)
47554 return CW_Default;
47555 Type *type = CallOperandVal->getType();
47556 // Look at the constraint type.
47557 switch (*constraint) {
47558 default:
47559 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
47560    LLVM_FALLTHROUGH;
47561 case 'R':
47562 case 'q':
47563 case 'Q':
47564 case 'a':
47565 case 'b':
47566 case 'c':
47567 case 'd':
47568 case 'S':
47569 case 'D':
47570 case 'A':
47571 if (CallOperandVal->getType()->isIntegerTy())
47572 weight = CW_SpecificReg;
47573 break;
47574 case 'f':
47575 case 't':
47576 case 'u':
47577 if (type->isFloatingPointTy())
47578 weight = CW_SpecificReg;
47579 break;
47580 case 'y':
47581 if (type->isX86_MMXTy() && Subtarget.hasMMX())
47582 weight = CW_SpecificReg;
47583 break;
47584 case 'Y': {
47585 unsigned Size = StringRef(constraint).size();
47586    // Pick 'i' as the next char, since 'Yi' and 'Y' are synonymous when matching 'Y'.
47587 char NextChar = Size == 2 ? constraint[1] : 'i';
47588 if (Size > 2)
47589 break;
47590 switch (NextChar) {
47591 default:
47592 return CW_Invalid;
47593 // XMM0
47594 case 'z':
47595 case '0':
47596 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
47597 return CW_SpecificReg;
47598 return CW_Invalid;
47599 // Conditional OpMask regs (AVX512)
47600 case 'k':
47601 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
47602 return CW_Register;
47603 return CW_Invalid;
47604 // Any MMX reg
47605 case 'm':
47606 if (type->isX86_MMXTy() && Subtarget.hasMMX())
47607 return weight;
47608 return CW_Invalid;
47609 // Any SSE reg when ISA >= SSE2, same as 'Y'
47610 case 'i':
47611 case 't':
47612 case '2':
47613 if (!Subtarget.hasSSE2())
47614 return CW_Invalid;
47615 break;
47616 }
47617 // Fall through (handle "Y" constraint).
47618    LLVM_FALLTHROUGH;
47619 }
47620 case 'v':
47621 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
47622 weight = CW_Register;
47623    LLVM_FALLTHROUGH;
47624 case 'x':
47625 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
47626 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
47627 weight = CW_Register;
47628 break;
47629 case 'k':
47630 // Enable conditional vector operations using %k<#> registers.
47631 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
47632 weight = CW_Register;
47633 break;
47634 case 'I':
47635 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
47636 if (C->getZExtValue() <= 31)
47637 weight = CW_Constant;
47638 }
47639 break;
47640 case 'J':
47641 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47642 if (C->getZExtValue() <= 63)
47643 weight = CW_Constant;
47644 }
47645 break;
47646 case 'K':
47647 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47648 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
47649 weight = CW_Constant;
47650 }
47651 break;
47652 case 'L':
47653 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47654 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
47655 weight = CW_Constant;
47656 }
47657 break;
47658 case 'M':
47659 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47660 if (C->getZExtValue() <= 3)
47661 weight = CW_Constant;
47662 }
47663 break;
47664 case 'N':
47665 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47666 if (C->getZExtValue() <= 0xff)
47667 weight = CW_Constant;
47668 }
47669 break;
47670 case 'G':
47671 case 'C':
47672 if (isa<ConstantFP>(CallOperandVal)) {
47673 weight = CW_Constant;
47674 }
47675 break;
47676 case 'e':
47677 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47678 if ((C->getSExtValue() >= -0x80000000LL) &&
47679 (C->getSExtValue() <= 0x7fffffffLL))
47680 weight = CW_Constant;
47681 }
47682 break;
47683 case 'Z':
47684 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
47685 if (C->getZExtValue() <= 0xffffffff)
47686 weight = CW_Constant;
47687 }
47688 break;
47689 }
47690 return weight;
47691}
47692
47693/// Try to replace an X constraint, which matches anything, with another that
47694/// has more specific requirements based on the type of the corresponding
47695/// operand.
47696const char *X86TargetLowering::
47697LowerXConstraint(EVT ConstraintVT) const {
47698 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
47699 // 'f' like normal targets.
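  // For example, an "X" constraint on a double operand is rewritten to "Y" when
  // SSE2 is available, so the operand is placed in an XMM register rather than on
  // the x87 stack.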
47700 if (ConstraintVT.isFloatingPoint()) {
47701 if (Subtarget.hasSSE2())
47702 return "Y";
47703 if (Subtarget.hasSSE1())
47704 return "x";
47705 }
47706
47707 return TargetLowering::LowerXConstraint(ConstraintVT);
47708}
47709
47710// Lower @cc targets via setcc.
47711SDValue X86TargetLowering::LowerAsmOutputForConstraint(
47712 SDValue &Chain, SDValue &Flag, SDLoc DL, const AsmOperandInfo &OpInfo,
47713 SelectionDAG &DAG) const {
47714 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
47715 if (Cond == X86::COND_INVALID)
47716 return SDValue();
47717 // Check that return type is valid.
47718 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
47719 OpInfo.ConstraintVT.getSizeInBits() < 8)
47720 report_fatal_error("Flag output operand is of invalid type");
47721
47722 // Get EFLAGS register. Only update chain when copyfrom is glued.
47723 if (Flag.getNode()) {
47724 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
47725 Chain = Flag.getValue(1);
47726 } else
47727 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
47728 // Extract CC code.
47729 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
47730 // Extend to 32-bits
47731 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
47732
47733 return Result;
47734}
47735
47736/// Lower the specified operand into the Ops vector.
47737/// If it is invalid, don't add anything to Ops.
47738void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
47739 std::string &Constraint,
47740 std::vector<SDValue>&Ops,
47741 SelectionDAG &DAG) const {
47742 SDValue Result;
47743
47744 // Only support length 1 constraints for now.
47745 if (Constraint.length() > 1) return;
47746
47747 char ConstraintLetter = Constraint[0];
47748 switch (ConstraintLetter) {
47749 default: break;
47750 case 'I':
47751 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47752 if (C->getZExtValue() <= 31) {
47753 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47754 Op.getValueType());
47755 break;
47756 }
47757 }
47758 return;
47759 case 'J':
47760 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47761 if (C->getZExtValue() <= 63) {
47762 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47763 Op.getValueType());
47764 break;
47765 }
47766 }
47767 return;
47768 case 'K':
47769 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47770 if (isInt<8>(C->getSExtValue())) {
47771 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47772 Op.getValueType());
47773 break;
47774 }
47775 }
47776 return;
47777 case 'L':
47778 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47779 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
47780 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
47781 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
47782 Op.getValueType());
47783 break;
47784 }
47785 }
47786 return;
47787 case 'M':
47788 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47789 if (C->getZExtValue() <= 3) {
47790 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47791 Op.getValueType());
47792 break;
47793 }
47794 }
47795 return;
47796 case 'N':
47797 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47798 if (C->getZExtValue() <= 255) {
47799 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47800 Op.getValueType());
47801 break;
47802 }
47803 }
47804 return;
47805 case 'O':
47806 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47807 if (C->getZExtValue() <= 127) {
47808 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47809 Op.getValueType());
47810 break;
47811 }
47812 }
47813 return;
47814 case 'e': {
47815 // 32-bit signed value
47816 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47817 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
47818 C->getSExtValue())) {
47819 // Widen to 64 bits here to get it sign extended.
47820 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
47821 break;
47822 }
47823 // FIXME gcc accepts some relocatable values here too, but only in certain
47824 // memory models; it's complicated.
47825 }
47826 return;
47827 }
47828 case 'Z': {
47829 // 32-bit unsigned value
47830 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
47831 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
47832 C->getZExtValue())) {
47833 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
47834 Op.getValueType());
47835 break;
47836 }
47837 }
47838 // FIXME gcc accepts some relocatable values here too, but only in certain
47839 // memory models; it's complicated.
47840 return;
47841 }
47842 case 'i': {
47843 // Literal immediates are always ok.
47844 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
47845 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
47846 BooleanContent BCont = getBooleanContents(MVT::i64);
47847 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
47848 : ISD::SIGN_EXTEND;
47849 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
47850 : CST->getSExtValue();
47851 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
47852 break;
47853 }
47854
47855 // In any sort of PIC mode addresses need to be computed at runtime by
47856 // adding in a register or some sort of table lookup. These can't
47857 // be used as immediates.
47858 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
47859 return;
47860
47861 // If we are in non-pic codegen mode, we allow the address of a global (with
47862 // an optional displacement) to be used with 'i'.
47863 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
47864 // If we require an extra load to get this address, as in PIC mode, we
47865 // can't accept it.
47866 if (isGlobalStubReference(
47867 Subtarget.classifyGlobalReference(GA->getGlobal())))
47868 return;
47869 break;
47870 }
47871 }
47872
47873 if (Result.getNode()) {
47874 Ops.push_back(Result);
47875 return;
47876 }
47877 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
47878}
47879
47880/// Check if \p RC is a general purpose register class.
47881/// I.e., GR* or one of their variant.
47882static bool isGRClass(const TargetRegisterClass &RC) {
47883 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
47884 RC.hasSuperClassEq(&X86::GR16RegClass) ||
47885 RC.hasSuperClassEq(&X86::GR32RegClass) ||
47886 RC.hasSuperClassEq(&X86::GR64RegClass) ||
47887 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
47888}
47889
47890/// Check if \p RC is a vector register class.
47891/// I.e., FR* / VR* or one of their variant.
47892static bool isFRClass(const TargetRegisterClass &RC) {
47893 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
47894 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
47895 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
47896 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
47897 RC.hasSuperClassEq(&X86::VR512RegClass);
47898}
47899
47900/// Check if \p RC is a mask register class.
47901/// I.e., VK* or one of their variant.
47902static bool isVKClass(const TargetRegisterClass &RC) {
47903 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
47904 RC.hasSuperClassEq(&X86::VK2RegClass) ||
47905 RC.hasSuperClassEq(&X86::VK4RegClass) ||
47906 RC.hasSuperClassEq(&X86::VK8RegClass) ||
47907 RC.hasSuperClassEq(&X86::VK16RegClass) ||
47908 RC.hasSuperClassEq(&X86::VK32RegClass) ||
47909 RC.hasSuperClassEq(&X86::VK64RegClass);
47910}
47911
47912std::pair<unsigned, const TargetRegisterClass *>
47913X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
47914 StringRef Constraint,
47915 MVT VT) const {
47916 // First, see if this is a constraint that directly corresponds to an LLVM
47917 // register class.
47918 if (Constraint.size() == 1) {
47919 // GCC Constraint Letters
47920 switch (Constraint[0]) {
47921 default: break;
47922 // 'A' means [ER]AX + [ER]DX.
47923 case 'A':
47924 if (Subtarget.is64Bit())
47925 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
47926      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
47927             "Expecting 64, 32 or 16 bit subtarget");
47928 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
47929
47930 // TODO: Slight differences here in allocation order and leaving
47931 // RIP in the class. Do they matter any more here than they do
47932 // in the normal allocation?
47933 case 'k':
47934 if (Subtarget.hasAVX512()) {
47935 if (VT == MVT::i1)
47936 return std::make_pair(0U, &X86::VK1RegClass);
47937 if (VT == MVT::i8)
47938 return std::make_pair(0U, &X86::VK8RegClass);
47939 if (VT == MVT::i16)
47940 return std::make_pair(0U, &X86::VK16RegClass);
47941 }
47942 if (Subtarget.hasBWI()) {
47943 if (VT == MVT::i32)
47944 return std::make_pair(0U, &X86::VK32RegClass);
47945 if (VT == MVT::i64)
47946 return std::make_pair(0U, &X86::VK64RegClass);
47947 }
47948 break;
47949 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
47950 if (Subtarget.is64Bit()) {
47951 if (VT == MVT::i32 || VT == MVT::f32)
47952 return std::make_pair(0U, &X86::GR32RegClass);
47953 if (VT == MVT::i16)
47954 return std::make_pair(0U, &X86::GR16RegClass);
47955 if (VT == MVT::i8 || VT == MVT::i1)
47956 return std::make_pair(0U, &X86::GR8RegClass);
47957 if (VT == MVT::i64 || VT == MVT::f64)
47958 return std::make_pair(0U, &X86::GR64RegClass);
47959 break;
47960 }
47961      LLVM_FALLTHROUGH;
47962 // 32-bit fallthrough
47963 case 'Q': // Q_REGS
47964 if (VT == MVT::i32 || VT == MVT::f32)
47965 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
47966 if (VT == MVT::i16)
47967 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
47968 if (VT == MVT::i8 || VT == MVT::i1)
47969 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
47970 if (VT == MVT::i64)
47971 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
47972 break;
47973 case 'r': // GENERAL_REGS
47974 case 'l': // INDEX_REGS
47975 if (VT == MVT::i8 || VT == MVT::i1)
47976 return std::make_pair(0U, &X86::GR8RegClass);
47977 if (VT == MVT::i16)
47978 return std::make_pair(0U, &X86::GR16RegClass);
47979 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
47980 return std::make_pair(0U, &X86::GR32RegClass);
47981 return std::make_pair(0U, &X86::GR64RegClass);
47982 case 'R': // LEGACY_REGS
47983 if (VT == MVT::i8 || VT == MVT::i1)
47984 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
47985 if (VT == MVT::i16)
47986 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
47987 if (VT == MVT::i32 || !Subtarget.is64Bit())
47988 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
47989 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
47990 case 'f': // FP Stack registers.
47991 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
47992 // value to the correct fpstack register class.
47993 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
47994 return std::make_pair(0U, &X86::RFP32RegClass);
47995 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
47996 return std::make_pair(0U, &X86::RFP64RegClass);
47997 return std::make_pair(0U, &X86::RFP80RegClass);
47998 case 'y': // MMX_REGS if MMX allowed.
47999 if (!Subtarget.hasMMX()) break;
48000 return std::make_pair(0U, &X86::VR64RegClass);
48001 case 'Y': // SSE_REGS if SSE2 allowed
48002 if (!Subtarget.hasSSE2()) break;
48003    LLVM_FALLTHROUGH;
48004 case 'v':
48005 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
48006 if (!Subtarget.hasSSE1()) break;
48007 bool VConstraint = (Constraint[0] == 'v');
48008
48009 switch (VT.SimpleTy) {
48010 default: break;
48011 // Scalar SSE types.
48012 case MVT::f32:
48013 case MVT::i32:
48014 if (VConstraint && Subtarget.hasVLX())
48015 return std::make_pair(0U, &X86::FR32XRegClass);
48016 return std::make_pair(0U, &X86::FR32RegClass);
48017 case MVT::f64:
48018 case MVT::i64:
48019 if (VConstraint && Subtarget.hasVLX())
48020 return std::make_pair(0U, &X86::FR64XRegClass);
48021 return std::make_pair(0U, &X86::FR64RegClass);
48022 // TODO: Handle i128 in FR128RegClass after it is tested well.
48023 // Vector types and fp128.
48024 case MVT::f128:
48025 case MVT::v16i8:
48026 case MVT::v8i16:
48027 case MVT::v4i32:
48028 case MVT::v2i64:
48029 case MVT::v4f32:
48030 case MVT::v2f64:
48031 if (VConstraint && Subtarget.hasVLX())
48032 return std::make_pair(0U, &X86::VR128XRegClass);
48033 return std::make_pair(0U, &X86::VR128RegClass);
48034 // AVX types.
48035 case MVT::v32i8:
48036 case MVT::v16i16:
48037 case MVT::v8i32:
48038 case MVT::v4i64:
48039 case MVT::v8f32:
48040 case MVT::v4f64:
48041 if (VConstraint && Subtarget.hasVLX())
48042 return std::make_pair(0U, &X86::VR256XRegClass);
48043 if (Subtarget.hasAVX())
48044 return std::make_pair(0U, &X86::VR256RegClass);
48045 break;
48046 case MVT::v8f64:
48047 case MVT::v16f32:
48048 case MVT::v16i32:
48049 case MVT::v8i64:
48050 if (!Subtarget.hasAVX512()) break;
48051 if (VConstraint)
48052 return std::make_pair(0U, &X86::VR512RegClass);
48053 return std::make_pair(0U, &X86::VR512_0_15RegClass);
48054 }
48055 break;
48056 }
48057 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
48058 switch (Constraint[1]) {
48059 default:
48060 break;
48061 case 'i':
48062 case 't':
48063 case '2':
48064 return getRegForInlineAsmConstraint(TRI, "Y", VT);
48065 case 'm':
48066 if (!Subtarget.hasMMX()) break;
48067 return std::make_pair(0U, &X86::VR64RegClass);
48068 case 'z':
48069 case '0':
48070 if (!Subtarget.hasSSE1()) break;
48071 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
48072 case 'k':
48073      // This register class doesn't allocate k0 for masked vector operations.
48074 if (Subtarget.hasAVX512()) {
48075 if (VT == MVT::i1)
48076 return std::make_pair(0U, &X86::VK1WMRegClass);
48077 if (VT == MVT::i8)
48078 return std::make_pair(0U, &X86::VK8WMRegClass);
48079 if (VT == MVT::i16)
48080 return std::make_pair(0U, &X86::VK16WMRegClass);
48081 }
48082 if (Subtarget.hasBWI()) {
48083 if (VT == MVT::i32)
48084 return std::make_pair(0U, &X86::VK32WMRegClass);
48085 if (VT == MVT::i64)
48086 return std::make_pair(0U, &X86::VK64WMRegClass);
48087 }
48088 break;
48089 }
48090 }
48091
48092 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
48093 return std::make_pair(0U, &X86::GR32RegClass);
48094
48095 // Use the default implementation in TargetLowering to convert the register
48096 // constraint into a member of a register class.
48097 std::pair<unsigned, const TargetRegisterClass*> Res;
48098 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
48099
48100 // Not found as a standard register?
48101 if (!Res.second) {
48102 // Map st(0) -> st(7) -> ST0
48103 if (Constraint.size() == 7 && Constraint[0] == '{' &&
48104 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
48105 Constraint[3] == '(' &&
48106 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
48107 Constraint[5] == ')' && Constraint[6] == '}') {
48108 // st(7) is not allocatable and thus not a member of RFP80. Return
48109 // singleton class in cases where we have a reference to it.
48110 if (Constraint[4] == '7')
48111 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
48112 return std::make_pair(X86::FP0 + Constraint[4] - '0',
48113 &X86::RFP80RegClass);
48114 }
48115
48116 // GCC allows "st(0)" to be called just plain "st".
48117 if (StringRef("{st}").equals_lower(Constraint))
48118 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
48119
48120 // flags -> EFLAGS
48121 if (StringRef("{flags}").equals_lower(Constraint))
48122 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
48123
48124 // dirflag -> DF
48125 if (StringRef("{dirflag}").equals_lower(Constraint))
48126 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
48127
48128 // fpsr -> FPSW
48129 if (StringRef("{fpsr}").equals_lower(Constraint))
48130 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
48131
48132 return Res;
48133 }
48134
48135 // Make sure it isn't a register that requires 64-bit mode.
48136 if (!Subtarget.is64Bit() &&
48137 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
48138 TRI->getEncodingValue(Res.first) >= 8) {
48139 // Register requires REX prefix, but we're in 32-bit mode.
48140 return std::make_pair(0, nullptr);
48141 }
48142
48143 // Make sure it isn't a register that requires AVX512.
48144 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
48145 TRI->getEncodingValue(Res.first) & 0x10) {
48146 // Register requires EVEX prefix.
48147 return std::make_pair(0, nullptr);
48148 }
48149
48150 // Otherwise, check to see if this is a register class of the wrong value
48151 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
48152 // turn into {ax},{dx}.
48153 // MVT::Other is used to specify clobber names.
48154 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
48155 return Res; // Correct type already, nothing to do.
48156
48157  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
48158 // return "eax". This should even work for things like getting 64bit integer
48159 // registers when given an f64 type.
48160 const TargetRegisterClass *Class = Res.second;
48161 // The generic code will match the first register class that contains the
48162 // given register. Thus, based on the ordering of the tablegened file,
48163 // the "plain" GR classes might not come first.
48164 // Therefore, use a helper method.
48165 if (isGRClass(*Class)) {
48166 unsigned Size = VT.getSizeInBits();
48167 if (Size == 1) Size = 8;
48168 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
48169 if (DestReg > 0) {
48170 bool is64Bit = Subtarget.is64Bit();
48171 const TargetRegisterClass *RC =
48172 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
48173 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
48174 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
48175 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
48176 : nullptr;
48177 if (Size == 64 && !is64Bit) {
48178 // Model GCC's behavior here and select a fixed pair of 32-bit
48179 // registers.
48180 switch (DestReg) {
48181 case X86::RAX:
48182 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
48183 case X86::RDX:
48184 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
48185 case X86::RCX:
48186 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
48187 case X86::RBX:
48188 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
48189 case X86::RSI:
48190 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
48191 case X86::RDI:
48192 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
48193 case X86::RBP:
48194 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
48195 default:
48196 return std::make_pair(0, nullptr);
48197 }
48198 }
48199 if (RC && RC->contains(DestReg))
48200 return std::make_pair(DestReg, RC);
48201 return Res;
48202 }
48203 // No register found/type mismatch.
48204 return std::make_pair(0, nullptr);
48205 } else if (isFRClass(*Class)) {
48206 // Handle references to XMM physical registers that got mapped into the
48207 // wrong class. This can happen with constraints like {xmm0} where the
48208 // target independent register mapper will just pick the first match it can
48209 // find, ignoring the required type.
48210
48211 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
48212 if (VT == MVT::f32 || VT == MVT::i32)
48213 Res.second = &X86::FR32XRegClass;
48214 else if (VT == MVT::f64 || VT == MVT::i64)
48215 Res.second = &X86::FR64XRegClass;
48216 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
48217 Res.second = &X86::VR128XRegClass;
48218 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
48219 Res.second = &X86::VR256XRegClass;
48220 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
48221 Res.second = &X86::VR512RegClass;
48222 else {
48223 // Type mismatch and not a clobber: Return an error;
48224 Res.first = 0;
48225 Res.second = nullptr;
48226 }
48227 } else if (isVKClass(*Class)) {
48228 if (VT == MVT::i1)
48229 Res.second = &X86::VK1RegClass;
48230 else if (VT == MVT::i8)
48231 Res.second = &X86::VK8RegClass;
48232 else if (VT == MVT::i16)
48233 Res.second = &X86::VK16RegClass;
48234 else if (VT == MVT::i32)
48235 Res.second = &X86::VK32RegClass;
48236 else if (VT == MVT::i64)
48237 Res.second = &X86::VK64RegClass;
48238 else {
48239 // Type mismatch and not a clobber: Return an error;
48240 Res.first = 0;
48241 Res.second = nullptr;
48242 }
48243 }
48244
48245 return Res;
48246}
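
Editor's sketch (not part of X86ISelLowering.cpp): two hypothetical queries showing how the constraint handling above resolves. It assumes TLI is this X86TargetLowering, TRI the X86 register info pointer, and a subtarget with at least SSE1.

    // "x" with a 128-bit vector type: handled by the single-letter switch;
    // VConstraint is false, so the generic XMM class is returned.
    auto XmmRC = TLI.getRegForInlineAsmConstraint(TRI, "x", MVT::v4f32);
    // -> {0, &X86::VR128RegClass}

    // "{st(7)}": per the fallback handling above, st(7) is not allocatable,
    // so the singleton RFP80_7 class is returned.
    auto St7RC = TLI.getRegForInlineAsmConstraint(TRI, "{st(7)}", MVT::f80);
    // -> {X86::FP7, &X86::RFP80_7RegClass}
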
48247
48248int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
48249 const AddrMode &AM, Type *Ty,
48250 unsigned AS) const {
48251 // Scaling factors are not free at all.
48252 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
48253 // will take 2 allocations in the out of order engine instead of 1
48254 // for plain addressing mode, i.e. inst (reg1).
48255 // E.g.,
48256 // vaddps (%rsi,%rdx), %ymm0, %ymm1
48257 // Requires two allocations (one for the load, one for the computation)
48258 // whereas:
48259 // vaddps (%rsi), %ymm0, %ymm1
48260 // Requires just 1 allocation, i.e., freeing allocations for other operations
48261 // and having less micro operations to execute.
48262 //
48263 // For some X86 architectures, this is even worse because for instance for
48264 // stores, the complex addressing mode forces the instruction to use the
48265 // "load" ports instead of the dedicated "store" port.
48266 // E.g., on Haswell:
48267 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
48268 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
48269 if (isLegalAddressingMode(DL, AM, Ty, AS))
48270 // Scale represents reg2 * scale, thus account for 1
48271 // as soon as we use a second register.
48272 return AM.Scale != 0;
48273 return -1;
48274}
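
Editor's sketch (not part of the source) of the two cost buckets described in the comment above; TLI, DL, and Int32Ty (a Type* for i32) are assumed to be in scope.

    TargetLowering::AddrMode AM;
    AM.HasBaseReg = true;                                               // inst (reg1)
    AM.Scale = 0;
    int Plain = TLI.getScalingFactorCost(DL, AM, Int32Ty, /*AS=*/0);    // 0: no extra allocation
    AM.Scale = 2;                                                       // inst (reg1, reg2, 2)
    int Indexed = TLI.getScalingFactorCost(DL, AM, Int32Ty, /*AS=*/0);  // 1: the index register costs one
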
48275
48276bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
48277 // Integer division on x86 is expensive. However, when aggressively optimizing
48278 // for code size, we prefer to use a div instruction, as it is usually smaller
48279 // than the alternative sequence.
48280 // The exception to this is vector division. Since x86 doesn't have vector
48281 // integer division, leaving the division as-is is a loss even in terms of
48282 // size, because it will have to be scalarized, while the alternative code
48283 // sequence can be performed in vector form.
48284 bool OptSize =
48285 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
48286 return OptSize && !VT.isVector();
48287}
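
Editor's sketch (not part of the source); Attrs is assumed to be the attribute list of a function compiled for minimum size (MinSize set).

    bool KeepScalarDiv = TLI.isIntDivCheap(MVT::i32, Attrs);   // true: a div instruction is smaller
    bool KeepVectorDiv = TLI.isIntDivCheap(MVT::v4i32, Attrs); // false: it would be scalarized anyway
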
48288
48289void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
48290 if (!Subtarget.is64Bit())
48291 return;
48292
48293 // Update IsSplitCSR in X86MachineFunctionInfo.
48294 X86MachineFunctionInfo *AFI =
48295 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
48296 AFI->setIsSplitCSR(true);
48297}
48298
48299void X86TargetLowering::insertCopiesSplitCSR(
48300 MachineBasicBlock *Entry,
48301 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
48302 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
48303 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
48304 if (!IStart)
48305 return;
48306
48307 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
48308 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
48309 MachineBasicBlock::iterator MBBI = Entry->begin();
48310 for (const MCPhysReg *I = IStart; *I; ++I) {
48311 const TargetRegisterClass *RC = nullptr;
48312 if (X86::GR64RegClass.contains(*I))
48313 RC = &X86::GR64RegClass;
48314 else
48315      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
48316
48317 Register NewVR = MRI->createVirtualRegister(RC);
48318 // Create copy from CSR to a virtual register.
48319 // FIXME: this currently does not emit CFI pseudo-instructions, it works
48320 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
48321 // nounwind. If we want to generalize this later, we may need to emit
48322 // CFI pseudo-instructions.
48323    assert(
48324        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
48325        "Function should be nounwind in insertCopiesSplitCSR!");
48326 Entry->addLiveIn(*I);
48327 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
48328 .addReg(*I);
48329
48330 // Insert the copy-back instructions right before the terminator.
48331 for (auto *Exit : Exits)
48332 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
48333 TII->get(TargetOpcode::COPY), *I)
48334 .addReg(NewVR);
48335 }
48336}
48337
48338bool X86TargetLowering::supportSwiftError() const {
48339 return Subtarget.is64Bit();
48340}
48341
48342/// Returns true if stack probing through a function call is requested.
48343bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
48344 return !getStackProbeSymbolName(MF).empty();
48345}
48346
48347/// Returns true if stack probing through inline assembly is requested.
48348bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
48349
48350 // No inline stack probe for Windows, they have their own mechanism.
48351 if (Subtarget.isOSWindows() ||
48352 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
48353 return false;
48354
48355 // If the function specifically requests inline stack probes, emit them.
48356 if (MF.getFunction().hasFnAttribute("probe-stack"))
48357 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
48358 "inline-asm";
48359
48360 return false;
48361}
48362
48363/// Returns the name of the symbol used to emit stack probes or the empty
48364/// string if not applicable.
48365StringRef
48366X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
48367  // Inline stack probes disable the stack probe call.
48368 if (hasInlineStackProbe(MF))
48369 return "";
48370
48371 // If the function specifically requests stack probes, emit them.
48372 if (MF.getFunction().hasFnAttribute("probe-stack"))
48373 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
48374
48375 // Generally, if we aren't on Windows, the platform ABI does not include
48376 // support for stack probes, so don't emit them.
48377 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
48378 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
48379 return "";
48380
48381 // We need a stack probe to conform to the Windows ABI. Choose the right
48382 // symbol.
48383 if (Subtarget.is64Bit())
48384 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
48385 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
48386}
48387
48388unsigned
48389X86TargetLowering::getStackProbeSize(MachineFunction &MF) const {
48390 // The default stack probe size is 4096 if the function has no stackprobesize
48391 // attribute.
48392 unsigned StackProbeSize = 4096;
48393 const Function &Fn = MF.getFunction();
48394 if (Fn.hasFnAttribute("stack-probe-size"))
48395 Fn.getFnAttribute("stack-probe-size")
48396 .getValueAsString()
48397 .getAsInteger(0, StackProbeSize);
48398 return StackProbeSize;
48399}
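
Editor's sketch (not part of the source) of how the three stack-probe hooks above combine; MF is assumed to be a MachineFunction on a non-Windows subtarget whose IR function carries "probe-stack"="inline-asm" and "stack-probe-size"="8192".

    if (TLI.hasInlineStackProbe(MF)) {
      // Inline probes suppress the symbol-based call:
      assert(TLI.getStackProbeSymbolName(MF).empty());
      unsigned ProbeSize = TLI.getStackProbeSize(MF); // 8192, parsed from the attribute
      (void)ProbeSize;
    }
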

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include/llvm/Support/MachineValueType.h

1//===- Support/MachineValueType.h - Machine-Level types ---------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of machine-level target independent types which
10// legal values in the code generator use.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_SUPPORT_MACHINEVALUETYPE_H
15#define LLVM_SUPPORT_MACHINEVALUETYPE_H
16
17#include "llvm/ADT/iterator_range.h"
18#include "llvm/Support/ErrorHandling.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/TypeSize.h"
21#include <cassert>
22
23namespace llvm {
24
25 class Type;
26
27 /// Machine Value Type. Every type that is supported natively by some
28 /// processor targeted by LLVM occurs here. This means that any legal value
29 /// type can be represented by an MVT.
30 class MVT {
31 public:
32 enum SimpleValueType : uint8_t {
33 // Simple value types that aren't explicitly part of this enumeration
34 // are considered extended value types.
35 INVALID_SIMPLE_VALUE_TYPE = 0,
36
37 // If you change this numbering, you must change the values in
38 // ValueTypes.td as well!
39 Other = 1, // This is a non-standard value
40 i1 = 2, // This is a 1 bit integer value
41 i8 = 3, // This is an 8 bit integer value
42 i16 = 4, // This is a 16 bit integer value
43 i32 = 5, // This is a 32 bit integer value
44 i64 = 6, // This is a 64 bit integer value
45 i128 = 7, // This is a 128 bit integer value
46
47 FIRST_INTEGER_VALUETYPE = i1,
48 LAST_INTEGER_VALUETYPE = i128,
49
50 f16 = 8, // This is a 16 bit floating point value
51 f32 = 9, // This is a 32 bit floating point value
52 f64 = 10, // This is a 64 bit floating point value
53 f80 = 11, // This is a 80 bit floating point value
54 f128 = 12, // This is a 128 bit floating point value
55 ppcf128 = 13, // This is a PPC 128-bit floating point value
56
57 FIRST_FP_VALUETYPE = f16,
58 LAST_FP_VALUETYPE = ppcf128,
59
60 v1i1 = 14, // 1 x i1
61 v2i1 = 15, // 2 x i1
62 v4i1 = 16, // 4 x i1
63 v8i1 = 17, // 8 x i1
64 v16i1 = 18, // 16 x i1
65 v32i1 = 19, // 32 x i1
66 v64i1 = 20, // 64 x i1
67 v128i1 = 21, // 128 x i1
68 v256i1 = 22, // 256 x i1
69 v512i1 = 23, // 512 x i1
70 v1024i1 = 24, // 1024 x i1
71
72 v1i8 = 25, // 1 x i8
73 v2i8 = 26, // 2 x i8
74 v4i8 = 27, // 4 x i8
75 v8i8 = 28, // 8 x i8
76 v16i8 = 29, // 16 x i8
77 v32i8 = 30, // 32 x i8
78 v64i8 = 31, // 64 x i8
79 v128i8 = 32, //128 x i8
80 v256i8 = 33, //256 x i8
81
82 v1i16 = 34, // 1 x i16
83 v2i16 = 35, // 2 x i16
84 v3i16 = 36, // 3 x i16
85 v4i16 = 37, // 4 x i16
86 v8i16 = 38, // 8 x i16
87 v16i16 = 39, // 16 x i16
88 v32i16 = 40, // 32 x i16
89 v64i16 = 41, // 64 x i16
90 v128i16 = 42, //128 x i16
91
92 v1i32 = 43, // 1 x i32
93 v2i32 = 44, // 2 x i32
94 v3i32 = 45, // 3 x i32
95 v4i32 = 46, // 4 x i32
96 v5i32 = 47, // 5 x i32
97 v8i32 = 48, // 8 x i32
98 v16i32 = 49, // 16 x i32
99 v32i32 = 50, // 32 x i32
100 v64i32 = 51, // 64 x i32
101 v128i32 = 52, // 128 x i32
102 v256i32 = 53, // 256 x i32
103 v512i32 = 54, // 512 x i32
104 v1024i32 = 55, // 1024 x i32
105 v2048i32 = 56, // 2048 x i32
106
107 v1i64 = 57, // 1 x i64
108 v2i64 = 58, // 2 x i64
109 v4i64 = 59, // 4 x i64
110 v8i64 = 60, // 8 x i64
111 v16i64 = 61, // 16 x i64
112 v32i64 = 62, // 32 x i64
113
114 v1i128 = 63, // 1 x i128
115
116 FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
117 LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE = v1i128,
118
119 v2f16 = 64, // 2 x f16
120 v3f16 = 65, // 3 x f16
121 v4f16 = 66, // 4 x f16
122 v8f16 = 67, // 8 x f16
123 v16f16 = 68, // 16 x f16
124 v32f16 = 69, // 32 x f16
125 v1f32 = 70, // 1 x f32
126 v2f32 = 71, // 2 x f32
127 v3f32 = 72, // 3 x f32
128 v4f32 = 73, // 4 x f32
129 v5f32 = 74, // 5 x f32
130 v8f32 = 75, // 8 x f32
131 v16f32 = 76, // 16 x f32
132 v32f32 = 77, // 32 x f32
133 v64f32 = 78, // 64 x f32
134 v128f32 = 79, // 128 x f32
135 v256f32 = 80, // 256 x f32
136 v512f32 = 81, // 512 x f32
137 v1024f32 = 82, // 1024 x f32
138 v2048f32 = 83, // 2048 x f32
139 v1f64 = 84, // 1 x f64
140 v2f64 = 85, // 2 x f64
141 v4f64 = 86, // 4 x f64
142 v8f64 = 87, // 8 x f64
143
144 FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE = v2f16,
145 LAST_FP_FIXEDLEN_VECTOR_VALUETYPE = v8f64,
146
147 FIRST_FIXEDLEN_VECTOR_VALUETYPE = v1i1,
148 LAST_FIXEDLEN_VECTOR_VALUETYPE = v8f64,
149
150 nxv1i1 = 88, // n x 1 x i1
151 nxv2i1 = 89, // n x 2 x i1
152 nxv4i1 = 90, // n x 4 x i1
153 nxv8i1 = 91, // n x 8 x i1
154 nxv16i1 = 92, // n x 16 x i1
155 nxv32i1 = 93, // n x 32 x i1
156
157 nxv1i8 = 94, // n x 1 x i8
158 nxv2i8 = 95, // n x 2 x i8
159 nxv4i8 = 96, // n x 4 x i8
160 nxv8i8 = 97, // n x 8 x i8
161 nxv16i8 = 98, // n x 16 x i8
162 nxv32i8 = 99, // n x 32 x i8
163
164 nxv1i16 = 100, // n x 1 x i16
165 nxv2i16 = 101, // n x 2 x i16
166 nxv4i16 = 102, // n x 4 x i16
167 nxv8i16 = 103, // n x 8 x i16
168 nxv16i16 = 104, // n x 16 x i16
169 nxv32i16 = 105, // n x 32 x i16
170
171 nxv1i32 = 106, // n x 1 x i32
172 nxv2i32 = 107, // n x 2 x i32
173 nxv4i32 = 108, // n x 4 x i32
174 nxv8i32 = 109, // n x 8 x i32
175 nxv16i32 = 110, // n x 16 x i32
176 nxv32i32 = 111, // n x 32 x i32
177
178 nxv1i64 = 112, // n x 1 x i64
179 nxv2i64 = 113, // n x 2 x i64
180 nxv4i64 = 114, // n x 4 x i64
181 nxv8i64 = 115, // n x 8 x i64
182 nxv16i64 = 116, // n x 16 x i64
183 nxv32i64 = 117, // n x 32 x i64
184
185 FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
186 LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE = nxv32i64,
187
188 nxv2f16 = 118, // n x 2 x f16
189 nxv4f16 = 119, // n x 4 x f16
190 nxv8f16 = 120, // n x 8 x f16
191 nxv1f32 = 121, // n x 1 x f32
192 nxv2f32 = 122, // n x 2 x f32
193 nxv4f32 = 123, // n x 4 x f32
194 nxv8f32 = 124, // n x 8 x f32
195 nxv16f32 = 125, // n x 16 x f32
196 nxv1f64 = 126, // n x 1 x f64
197 nxv2f64 = 127, // n x 2 x f64
198 nxv4f64 = 128, // n x 4 x f64
199 nxv8f64 = 129, // n x 8 x f64
200
201 FIRST_FP_SCALABLE_VECTOR_VALUETYPE = nxv2f16,
202 LAST_FP_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
203
204 FIRST_SCALABLE_VECTOR_VALUETYPE = nxv1i1,
205 LAST_SCALABLE_VECTOR_VALUETYPE = nxv8f64,
206
207 FIRST_VECTOR_VALUETYPE = v1i1,
208 LAST_VECTOR_VALUETYPE = nxv8f64,
209
210 x86mmx = 130, // This is an X86 MMX value
211
212 Glue = 131, // This glues nodes together during pre-RA sched
213
214 isVoid = 132, // This has no value
215
216 Untyped = 133, // This value takes a register, but has
217 // unspecified type. The register class
218 // will be determined by the opcode.
219
220 exnref = 134, // WebAssembly's exnref type
221
222 FIRST_VALUETYPE = 1, // This is always the beginning of the list.
223 LAST_VALUETYPE = 135, // This always remains at the end of the list.
224
225 // This is the current maximum for LAST_VALUETYPE.
226 // MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
227 // This value must be a multiple of 32.
228 MAX_ALLOWED_VALUETYPE = 160,
229
230 // A value of type llvm::TokenTy
231 token = 248,
232
233 // This is MDNode or MDString.
234 Metadata = 249,
235
236 // An int value the size of the pointer of the current
237 // target to any address space. This must only be used internal to
238 // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR.
239 iPTRAny = 250,
240
241 // A vector with any length and element size. This is used
242 // for intrinsics that have overloadings based on vector types.
243 // This is only for tblgen's consumption!
244 vAny = 251,
245
246 // Any floating-point or vector floating-point value. This is used
247 // for intrinsics that have overloadings based on floating-point types.
248 // This is only for tblgen's consumption!
249 fAny = 252,
250
251 // An integer or vector integer value of any bit width. This is
252 // used for intrinsics that have overloadings based on integer bit widths.
253 // This is only for tblgen's consumption!
254 iAny = 253,
255
256 // An int value the size of the pointer of the current
257 // target. This should only be used internal to tblgen!
258 iPTR = 254,
259
260 // Any type. This is used for intrinsics that have overloadings.
261 // This is only for tblgen's consumption!
262 Any = 255
263 };
264
265 SimpleValueType SimpleTy = INVALID_SIMPLE_VALUE_TYPE;
266
267 constexpr MVT() = default;
268 constexpr MVT(SimpleValueType SVT) : SimpleTy(SVT) {}
269
270 bool operator>(const MVT& S) const { return SimpleTy > S.SimpleTy; }
271 bool operator<(const MVT& S) const { return SimpleTy < S.SimpleTy; }
272 bool operator==(const MVT& S) const { return SimpleTy == S.SimpleTy; }
273 bool operator!=(const MVT& S) const { return SimpleTy != S.SimpleTy; }
274 bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
275 bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
276
277 /// Return true if this is a valid simple valuetype.
278 bool isValid() const {
279 return (SimpleTy >= MVT::FIRST_VALUETYPE &&
280 SimpleTy < MVT::LAST_VALUETYPE);
281 }
282
283 /// Return true if this is a FP or a vector FP type.
284 bool isFloatingPoint() const {
285 return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
286 SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
287 (SimpleTy >= MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE &&
288 SimpleTy <= MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE) ||
289 (SimpleTy >= MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE &&
290 SimpleTy <= MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE));
291 }
292
293 /// Return true if this is an integer or a vector integer type.
294 bool isInteger() const {
295 return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
296 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
297 (SimpleTy >= MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE &&
298 SimpleTy <= MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE) ||
299 (SimpleTy >= MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE &&
300 SimpleTy <= MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE));
301 }
302
303 /// Return true if this is an integer, not including vectors.
304 bool isScalarInteger() const {
305 return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
306 SimpleTy <= MVT::LAST_INTEGER_VALUETYPE);
307 }
308
309 /// Return true if this is a vector value type.
310 bool isVector() const {
311      return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE &&
    7: Assuming field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
    20.1: Field 'SimpleTy' is >= FIRST_VECTOR_VALUETYPE
312              SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
    8: Assuming field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
    20.2: Field 'SimpleTy' is <= LAST_VECTOR_VALUETYPE
    9, 21: Returning the value 1, which participates in a condition later
313 }
314
315 /// Return true if this is a vector value type where the
316 /// runtime length is machine dependent
317 bool isScalableVector() const {
318 return (SimpleTy >= MVT::FIRST_SCALABLE_VECTOR_VALUETYPE &&
319 SimpleTy <= MVT::LAST_SCALABLE_VECTOR_VALUETYPE);
320 }
321
322 bool isFixedLengthVector() const {
323 return (SimpleTy >= MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE &&
324 SimpleTy <= MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE);
325 }
326
327 /// Return true if this is a 16-bit vector type.
328 bool is16BitVector() const {
329 return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 ||
330 SimpleTy == MVT::v16i1);
331 }
332
333 /// Return true if this is a 32-bit vector type.
334 bool is32BitVector() const {
335 return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8 ||
336 SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
337 SimpleTy == MVT::v2f16 || SimpleTy == MVT::v1f32);
338 }
339
340 /// Return true if this is a 64-bit vector type.
341 bool is64BitVector() const {
342 return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8 ||
343 SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
344 SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
345 SimpleTy == MVT::v2f32 || SimpleTy == MVT::v1f64);
346 }
347
348 /// Return true if this is a 128-bit vector type.
349 bool is128BitVector() const {
350 return (SimpleTy == MVT::v128i1 || SimpleTy == MVT::v16i8 ||
351 SimpleTy == MVT::v8i16 || SimpleTy == MVT::v4i32 ||
352 SimpleTy == MVT::v2i64 || SimpleTy == MVT::v1i128 ||
353 SimpleTy == MVT::v8f16 || SimpleTy == MVT::v4f32 ||
354 SimpleTy == MVT::v2f64);
355 }
356
357 /// Return true if this is a 256-bit vector type.
358 bool is256BitVector() const {
359 return (SimpleTy == MVT::v16f16 || SimpleTy == MVT::v8f32 ||
360 SimpleTy == MVT::v4f64 || SimpleTy == MVT::v32i8 ||
361 SimpleTy == MVT::v16i16 || SimpleTy == MVT::v8i32 ||
362 SimpleTy == MVT::v4i64 || SimpleTy == MVT::v256i1);
363 }
364
365 /// Return true if this is a 512-bit vector type.
366 bool is512BitVector() const {
367 return (SimpleTy == MVT::v32f16 || SimpleTy == MVT::v16f32 ||
368 SimpleTy == MVT::v8f64 || SimpleTy == MVT::v512i1 ||
369 SimpleTy == MVT::v64i8 || SimpleTy == MVT::v32i16 ||
370 SimpleTy == MVT::v16i32 || SimpleTy == MVT::v8i64);
371 }
372
373 /// Return true if this is a 1024-bit vector type.
374 bool is1024BitVector() const {
375 return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 ||
376 SimpleTy == MVT::v64i16 || SimpleTy == MVT::v32i32 ||
377 SimpleTy == MVT::v16i64);
378 }
379
380 /// Return true if this is a 2048-bit vector type.
381 bool is2048BitVector() const {
382 return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
383 SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64);
384 }
385
386 /// Return true if this is an overloaded type for TableGen.
387 bool isOverloaded() const {
388 return (SimpleTy==MVT::Any ||
389 SimpleTy==MVT::iAny || SimpleTy==MVT::fAny ||
390 SimpleTy==MVT::vAny || SimpleTy==MVT::iPTRAny);
391 }
392
393 /// Return a VT for a vector type with the same element type but
394 /// half the number of elements.
395 MVT getHalfNumVectorElementsVT() const {
396 MVT EltVT = getVectorElementType();
397 auto EltCnt = getVectorElementCount();
398      assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!");
399 return getVectorVT(EltVT, EltCnt / 2);
400 }
401
402 /// Returns true if the given vector is a power of 2.
403 bool isPow2VectorType() const {
404 unsigned NElts = getVectorNumElements();
405 return !(NElts & (NElts - 1));
406 }
407
408 /// Widens the length of the given vector MVT up to the nearest power of 2
409 /// and returns that type.
410 MVT getPow2VectorType() const {
411 if (isPow2VectorType())
412 return *this;
413
414 unsigned NElts = getVectorNumElements();
415 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
416 return MVT::getVectorVT(getVectorElementType(), Pow2NElts);
417 }
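      // Editor's illustration (not part of the header): MVT::v3f32 widens to
      // MVT::v4f32 here; an already power-of-two type such as MVT::v8i16 is
      // returned unchanged by the early isPow2VectorType() check.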
418
419 /// If this is a vector, return the element type, otherwise return this.
420 MVT getScalarType() const {
421 return isVector() ? getVectorElementType() : *this;
422 }
423
424 MVT getVectorElementType() const {
425 switch (SimpleTy) {
426 default:
427      llvm_unreachable("Not a vector MVT!");
428 case v1i1:
429 case v2i1:
430 case v4i1:
431 case v8i1:
432 case v16i1:
433 case v32i1:
434 case v64i1:
435 case v128i1:
436 case v256i1:
437 case v512i1:
438 case v1024i1:
439 case nxv1i1:
440 case nxv2i1:
441 case nxv4i1:
442 case nxv8i1:
443 case nxv16i1:
444 case nxv32i1: return i1;
445 case v1i8:
446 case v2i8:
447 case v4i8:
448 case v8i8:
449 case v16i8:
450 case v32i8:
451 case v64i8:
452 case v128i8:
453 case v256i8:
454 case nxv1i8:
455 case nxv2i8:
456 case nxv4i8:
457 case nxv8i8:
458 case nxv16i8:
459 case nxv32i8: return i8;
460 case v1i16:
461 case v2i16:
462 case v3i16:
463 case v4i16:
464 case v8i16:
465 case v16i16:
466 case v32i16:
467 case v64i16:
468 case v128i16:
469 case nxv1i16:
470 case nxv2i16:
471 case nxv4i16:
472 case nxv8i16:
473 case nxv16i16:
474 case nxv32i16: return i16;
475 case v1i32:
476 case v2i32:
477 case v3i32:
478 case v4i32:
479 case v5i32:
480 case v8i32:
481 case v16i32:
482 case v32i32:
483 case v64i32:
484 case v128i32:
485 case v256i32:
486 case v512i32:
487 case v1024i32:
488 case v2048i32:
489 case nxv1i32:
490 case nxv2i32:
491 case nxv4i32:
492 case nxv8i32:
493 case nxv16i32:
494 case nxv32i32: return i32;
495 case v1i64:
496 case v2i64:
497 case v4i64:
498 case v8i64:
499 case v16i64:
500 case v32i64:
501 case nxv1i64:
502 case nxv2i64:
503 case nxv4i64:
504 case nxv8i64:
505 case nxv16i64:
506 case nxv32i64: return i64;
507 case v1i128: return i128;
508 case v2f16:
509 case v3f16:
510 case v4f16:
511 case v8f16:
512 case v16f16:
513 case v32f16:
514 case nxv2f16:
515 case nxv4f16:
516 case nxv8f16: return f16;
517 case v1f32:
518 case v2f32:
519 case v3f32:
520 case v4f32:
521 case v5f32:
522 case v8f32:
523 case v16f32:
524 case v32f32:
525 case v64f32:
526 case v128f32:
527 case v256f32:
528 case v512f32:
529 case v1024f32:
530 case v2048f32:
531 case nxv1f32:
532 case nxv2f32:
533 case nxv4f32:
534 case nxv8f32:
535 case nxv16f32: return f32;
536 case v1f64:
537 case v2f64:
538 case v4f64:
539 case v8f64:
540 case nxv1f64:
541 case nxv2f64:
542 case nxv4f64:
543 case nxv8f64: return f64;
544 }
545 }
546
547 unsigned getVectorNumElements() const {
548 switch (SimpleTy) {
549 default:
550      llvm_unreachable("Not a vector MVT!");
551 case v2048i32:
552 case v2048f32: return 2048;
553 case v1024i1:
554 case v1024i32:
555 case v1024f32: return 1024;
556 case v512i1:
557 case v512i32:
558 case v512f32: return 512;
559 case v256i1:
560 case v256i8:
561 case v256i32:
562 case v256f32: return 256;
563 case v128i1:
564 case v128i8:
565 case v128i16:
566 case v128i32:
567 case v128f32: return 128;
568 case v64i1:
569 case v64i8:
570 case v64i16:
571 case v64i32:
572 case v64f32: return 64;
573 case v32i1:
574 case v32i8:
575 case v32i16:
576 case v32i32:
577 case v32i64:
578 case v32f16:
579 case v32f32:
580 case nxv32i1:
581 case nxv32i8:
582 case nxv32i16:
583 case nxv32i32:
584 case nxv32i64: return 32;
585 case v16i1:
586 case v16i8:
587 case v16i16:
588 case v16i32:
589 case v16i64:
590 case v16f16:
591 case v16f32:
592 case nxv16i1:
593 case nxv16i8:
594 case nxv16i16:
595 case nxv16i32:
596 case nxv16i64:
597 case nxv16f32: return 16;
598 case v8i1:
599 case v8i8:
600 case v8i16:
601 case v8i32:
602 case v8i64:
603 case v8f16:
604 case v8f32:
605 case v8f64:
606 case nxv8i1:
607 case nxv8i8:
608 case nxv8i16:
609 case nxv8i32:
610 case nxv8i64:
611 case nxv8f16:
612 case nxv8f32:
613 case nxv8f64: return 8;
614 case v5i32:
615 case v5f32: return 5;
616 case v4i1:
617 case v4i8:
618 case v4i16:
619 case v4i32:
620 case v4i64:
621 case v4f16:
622 case v4f32:
623 case v4f64:
624 case nxv4i1:
625 case nxv4i8:
626 case nxv4i16:
627 case nxv4i32:
628 case nxv4i64:
629 case nxv4f16:
630 case nxv4f32:
631 case nxv4f64: return 4;
632 case v3i16:
633 case v3i32:
634 case v3f16:
635 case v3f32: return 3;
636 case v2i1:
637 case v2i8:
638 case v2i16:
639 case v2i32:
640 case v2i64:
641 case v2f16:
642 case v2f32:
643 case v2f64:
644 case nxv2i1:
645 case nxv2i8:
646 case nxv2i16:
647 case nxv2i32:
648 case nxv2i64:
649 case nxv2f16:
650 case nxv2f32:
651 case nxv2f64: return 2;
652 case v1i1:
653 case v1i8:
654 case v1i16:
655 case v1i32:
656 case v1i64:
657 case v1i128:
658 case v1f32:
659 case v1f64:
660 case nxv1i1:
661 case nxv1i8:
662 case nxv1i16:
663 case nxv1i32:
664 case nxv1i64:
665 case nxv1f32:
666 case nxv1f64: return 1;
667 }
668 }
669
670 ElementCount getVectorElementCount() const {
671 return { getVectorNumElements(), isScalableVector() };
672 }
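    // Editor's illustration (not part of the header): MVT::v4i32 yields an
    // ElementCount of {4, /*Scalable=*/false}, while the scalable MVT::nxv4i32
    // yields {4, /*Scalable=*/true}.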
673
674 /// Returns the size of the specified MVT in bits.
675 ///
676 /// If the value type is a scalable vector type, the scalable property will
677 /// be set and the runtime size will be a positive integer multiple of the
678 /// base size.
679 TypeSize getSizeInBits() const {
680 switch (SimpleTy) {
681 default:
682      llvm_unreachable("getSizeInBits called on extended MVT.");
683    case Other:
684      llvm_unreachable("Value type is non-standard value, Other.");
685    case iPTR:
686      llvm_unreachable("Value type size is target-dependent. Ask TLI.");
687    case iPTRAny:
688    case iAny:
689    case fAny:
690    case vAny:
691    case Any:
692      llvm_unreachable("Value type is overloaded.");
693    case token:
694      llvm_unreachable("Token type is a sentinel that cannot be used "
695                       "in codegen and has no size");
696    case Metadata:
697      llvm_unreachable("Value type is metadata.");
698 case i1:
699 case v1i1: return TypeSize::Fixed(1);
700 case nxv1i1: return TypeSize::Scalable(1);
701 case v2i1: return TypeSize::Fixed(2);
702 case nxv2i1: return TypeSize::Scalable(2);
703 case v4i1: return TypeSize::Fixed(4);
704 case nxv4i1: return TypeSize::Scalable(4);
705 case i8 :
706 case v1i8:
707 case v8i1: return TypeSize::Fixed(8);
708 case nxv1i8:
709 case nxv8i1: return TypeSize::Scalable(8);
710 case i16 :
711 case f16:
712 case v16i1:
713 case v2i8:
714 case v1i16: return TypeSize::Fixed(16);
715 case nxv16i1:
716 case nxv2i8:
717 case nxv1i16: return TypeSize::Scalable(16);
718 case f32 :
719 case i32 :
720 case v32i1:
721 case v4i8:
722 case v2i16:
723 case v2f16:
724 case v1f32:
725 case v1i32: return TypeSize::Fixed(32);
726 case nxv32i1:
727 case nxv4i8:
728 case nxv2i16:
729 case nxv1i32:
730 case nxv2f16:
731 case nxv1f32: return TypeSize::Scalable(32);
732 case v3i16:
733 case v3f16: return TypeSize::Fixed(48);
734 case x86mmx:
735 case f64 :
736 case i64 :
737 case v64i1:
738 case v8i8:
739 case v4i16:
740 case v2i32:
741 case v1i64:
742 case v4f16:
743 case v2f32:
744 case v1f64: return TypeSize::Fixed(64);
745 case nxv8i8:
746 case nxv4i16:
747 case nxv2i32:
748 case nxv1i64:
749 case nxv4f16:
750 case nxv2f32:
751 case nxv1f64: return TypeSize::Scalable(64);
752 case f80 : return TypeSize::Fixed(80);
753 case v3i32:
754 case v3f32: return TypeSize::Fixed(96);
755 case f128:
756 case ppcf128:
757 case i128:
758 case v128i1:
759 case v16i8:
760 case v8i16:
761 case v4i32:
762 case v2i64:
763 case v1i128:
764 case v8f16:
765 case v4f32:
766 case v2f64: return TypeSize::Fixed(128);
767 case nxv16i8:
768 case nxv8i16:
769 case nxv4i32:
770 case nxv2i64:
771 case nxv8f16:
772 case nxv4f32:
773 case nxv2f64: return TypeSize::Scalable(128);
774 case v5i32:
775 case v5f32: return TypeSize::Fixed(160);
776 case v256i1:
777 case v32i8:
778 case v16i16:
779 case v8i32:
780 case v4i64:
781 case v16f16:
782 case v8f32:
783 case v4f64: return TypeSize::Fixed(256);
784 case nxv32i8:
785 case nxv16i16:
786 case nxv8i32:
787 case nxv4i64:
788 case nxv8f32:
789 case nxv4f64: return TypeSize::Scalable(256);
790 case v512i1:
791 case v64i8:
792 case v32i16:
793 case v16i32:
794 case v8i64:
795 case v32f16:
796 case v16f32:
797 case v8f64: return TypeSize::Fixed(512);
798 case nxv32i16:
799 case nxv16i32:
800 case nxv8i64:
801 case nxv16f32:
802 case nxv8f64: return TypeSize::Scalable(512);
803 case v1024i1:
804 case v128i8:
805 case v64i16:
806 case v32i32:
807 case v16i64:
808 case v32f32: return TypeSize::Fixed(1024);
809 case nxv32i32:
810 case nxv16i64: return TypeSize::Scalable(1024);
811 case v256i8:
812 case v128i16:
813 case v64i32:
814 case v32i64:
815 case v64f32: return TypeSize::Fixed(2048);
816 case nxv32i64: return TypeSize::Scalable(2048);
817 case v128i32:
818 case v128f32: return TypeSize::Fixed(4096);
819 case v256i32:
820 case v256f32: return TypeSize::Fixed(8192);
821 case v512i32:
822 case v512f32: return TypeSize::Fixed(16384);
823 case v1024i32:
824 case v1024f32: return TypeSize::Fixed(32768);
825 case v2048i32:
826 case v2048f32: return TypeSize::Fixed(65536);
827 case exnref: return TypeSize::Fixed(0); // opaque type
828 }
829 }
830
831 TypeSize getScalarSizeInBits() const {
832 return getScalarType().getSizeInBits();
833 }
834
835 /// Return the number of bytes overwritten by a store of the specified value
836 /// type.
837 ///
838 /// If the value type is a scalable vector type, the scalable property will
839 /// be set and the runtime size will be a positive integer multiple of the
840 /// base size.
841 TypeSize getStoreSize() const {
842 TypeSize BaseSize = getSizeInBits();
843 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
844 }
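    // Editor's illustration (not part of the header), showing the rounding
    // above: MVT::i1 stores as 1 byte, MVT::v3i32 (96 bits) as 12 bytes, and
    // the scalable MVT::nxv2i64 reports a known minimum of 16 bytes with the
    // scalable flag set.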
845
846 /// Return the number of bits overwritten by a store of the specified value
847 /// type.
848 ///
849 /// If the value type is a scalable vector type, the scalable property will
850 /// be set and the runtime size will be a positive integer multiple of the
851 /// base size.
852 TypeSize getStoreSizeInBits() const {
853 return getStoreSize() * 8;
854 }
855
856 /// Returns true if the number of bits for the type is a multiple of an
857 /// 8-bit byte.
858 bool isByteSized() const {
859 return getSizeInBits().isByteSized();
860 }
861
862 /// Return true if this has more bits than VT.
863 bool bitsGT(MVT VT) const {
864 return getSizeInBits() > VT.getSizeInBits();
865 }
866
867 /// Return true if this has no less bits than VT.
868 bool bitsGE(MVT VT) const {
869 return getSizeInBits() >= VT.getSizeInBits();
870 }
871
872 /// Return true if this has less bits than VT.
873 bool bitsLT(MVT VT) const {
874 return getSizeInBits() < VT.getSizeInBits();
875 }
876
877 /// Return true if this has no more bits than VT.
878 bool bitsLE(MVT VT) const {
879 return getSizeInBits() <= VT.getSizeInBits();
880 }
881
882 static MVT getFloatingPointVT(unsigned BitWidth) {
883 switch (BitWidth) {
884 default:
885      llvm_unreachable("Bad bit width!");
886 case 16:
887 return MVT::f16;
888 case 32:
889 return MVT::f32;
890 case 64:
891 return MVT::f64;
892 case 80:
893 return MVT::f80;
894 case 128:
895 return MVT::f128;
896 }
897 }
898
899 static MVT getIntegerVT(unsigned BitWidth) {
900 switch (BitWidth) {
901 default:
902 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
903 case 1:
904 return MVT::i1;
905 case 8:
906 return MVT::i8;
907 case 16:
908 return MVT::i16;
909 case 32:
910 return MVT::i32;
911 case 64:
912 return MVT::i64;
913 case 128:
914 return MVT::i128;
915 }
916 }
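A small sketch (illustrative only; integerVTExample is a hypothetical helper) of how these factories behave for supported and unsupported widths:

  #include "llvm/Support/MachineValueType.h"
  #include <cassert>

  void integerVTExample() {
    // Widths listed in the switches above yield the matching simple type.
    assert(llvm::MVT::getIntegerVT(32) == llvm::MVT::i32);
    assert(llvm::MVT::getFloatingPointVT(80) == llvm::MVT::f80);
    // Any other integer width falls through to the invalid sentinel.
    assert(llvm::MVT::getIntegerVT(24).SimpleTy ==
           llvm::MVT::INVALID_SIMPLE_VALUE_TYPE);
  }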
917
918 static MVT getVectorVT(MVT VT, unsigned NumElements) {
919 switch (VT.SimpleTy) {
920 default:
921 break;
922 case MVT::i1:
923 if (NumElements == 1) return MVT::v1i1;
924 if (NumElements == 2) return MVT::v2i1;
925 if (NumElements == 4) return MVT::v4i1;
926 if (NumElements == 8) return MVT::v8i1;
927 if (NumElements == 16) return MVT::v16i1;
928 if (NumElements == 32) return MVT::v32i1;
929 if (NumElements == 64) return MVT::v64i1;
930 if (NumElements == 128) return MVT::v128i1;
931 if (NumElements == 256) return MVT::v256i1;
932 if (NumElements == 512) return MVT::v512i1;
933 if (NumElements == 1024) return MVT::v1024i1;
934 break;
935 case MVT::i8:
936 if (NumElements == 1) return MVT::v1i8;
937 if (NumElements == 2) return MVT::v2i8;
938 if (NumElements == 4) return MVT::v4i8;
939 if (NumElements == 8) return MVT::v8i8;
940 if (NumElements == 16) return MVT::v16i8;
941 if (NumElements == 32) return MVT::v32i8;
942 if (NumElements == 64) return MVT::v64i8;
943 if (NumElements == 128) return MVT::v128i8;
944 if (NumElements == 256) return MVT::v256i8;
945 break;
946 case MVT::i16:
947 if (NumElements == 1) return MVT::v1i16;
948 if (NumElements == 2) return MVT::v2i16;
949 if (NumElements == 3) return MVT::v3i16;
950 if (NumElements == 4) return MVT::v4i16;
951 if (NumElements == 8) return MVT::v8i16;
952 if (NumElements == 16) return MVT::v16i16;
953 if (NumElements == 32) return MVT::v32i16;
954 if (NumElements == 64) return MVT::v64i16;
955 if (NumElements == 128) return MVT::v128i16;
956 break;
957 case MVT::i32:
958 if (NumElements == 1) return MVT::v1i32;
959 if (NumElements == 2) return MVT::v2i32;
960 if (NumElements == 3) return MVT::v3i32;
961 if (NumElements == 4) return MVT::v4i32;
962 if (NumElements == 5) return MVT::v5i32;
963 if (NumElements == 8) return MVT::v8i32;
964 if (NumElements == 16) return MVT::v16i32;
965 if (NumElements == 32) return MVT::v32i32;
966 if (NumElements == 64) return MVT::v64i32;
967 if (NumElements == 128) return MVT::v128i32;
968 if (NumElements == 256) return MVT::v256i32;
969 if (NumElements == 512) return MVT::v512i32;
970 if (NumElements == 1024) return MVT::v1024i32;
971 if (NumElements == 2048) return MVT::v2048i32;
972 break;
973 case MVT::i64:
974 if (NumElements == 1) return MVT::v1i64;
975 if (NumElements == 2) return MVT::v2i64;
976 if (NumElements == 4) return MVT::v4i64;
977 if (NumElements == 8) return MVT::v8i64;
978 if (NumElements == 16) return MVT::v16i64;
979 if (NumElements == 32) return MVT::v32i64;
980 break;
981 case MVT::i128:
982 if (NumElements == 1) return MVT::v1i128;
983 break;
984 case MVT::f16:
985 if (NumElements == 2) return MVT::v2f16;
986 if (NumElements == 3) return MVT::v3f16;
987 if (NumElements == 4) return MVT::v4f16;
988 if (NumElements == 8) return MVT::v8f16;
989 if (NumElements == 16) return MVT::v16f16;
990 if (NumElements == 32) return MVT::v32f16;
991 break;
992 case MVT::f32:
993 if (NumElements == 1) return MVT::v1f32;
994 if (NumElements == 2) return MVT::v2f32;
995 if (NumElements == 3) return MVT::v3f32;
996 if (NumElements == 4) return MVT::v4f32;
997 if (NumElements == 5) return MVT::v5f32;
998 if (NumElements == 8) return MVT::v8f32;
999 if (NumElements == 16) return MVT::v16f32;
1000 if (NumElements == 32) return MVT::v32f32;
1001 if (NumElements == 64) return MVT::v64f32;
1002 if (NumElements == 128) return MVT::v128f32;
1003 if (NumElements == 256) return MVT::v256f32;
1004 if (NumElements == 512) return MVT::v512f32;
1005 if (NumElements == 1024) return MVT::v1024f32;
1006 if (NumElements == 2048) return MVT::v2048f32;
1007 break;
1008 case MVT::f64:
1009 if (NumElements == 1) return MVT::v1f64;
1010 if (NumElements == 2) return MVT::v2f64;
1011 if (NumElements == 4) return MVT::v4f64;
1012 if (NumElements == 8) return MVT::v8f64;
1013 break;
1014 }
1015 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1016 }
1017
1018 static MVT getScalableVectorVT(MVT VT, unsigned NumElements) {
1019 switch(VT.SimpleTy) {
1020 default:
1021 break;
1022 case MVT::i1:
1023 if (NumElements == 1) return MVT::nxv1i1;
1024 if (NumElements == 2) return MVT::nxv2i1;
1025 if (NumElements == 4) return MVT::nxv4i1;
1026 if (NumElements == 8) return MVT::nxv8i1;
1027 if (NumElements == 16) return MVT::nxv16i1;
1028 if (NumElements == 32) return MVT::nxv32i1;
1029 break;
1030 case MVT::i8:
1031 if (NumElements == 1) return MVT::nxv1i8;
1032 if (NumElements == 2) return MVT::nxv2i8;
1033 if (NumElements == 4) return MVT::nxv4i8;
1034 if (NumElements == 8) return MVT::nxv8i8;
1035 if (NumElements == 16) return MVT::nxv16i8;
1036 if (NumElements == 32) return MVT::nxv32i8;
1037 break;
1038 case MVT::i16:
1039 if (NumElements == 1) return MVT::nxv1i16;
1040 if (NumElements == 2) return MVT::nxv2i16;
1041 if (NumElements == 4) return MVT::nxv4i16;
1042 if (NumElements == 8) return MVT::nxv8i16;
1043 if (NumElements == 16) return MVT::nxv16i16;
1044 if (NumElements == 32) return MVT::nxv32i16;
1045 break;
1046 case MVT::i32:
1047 if (NumElements == 1) return MVT::nxv1i32;
1048 if (NumElements == 2) return MVT::nxv2i32;
1049 if (NumElements == 4) return MVT::nxv4i32;
1050 if (NumElements == 8) return MVT::nxv8i32;
1051 if (NumElements == 16) return MVT::nxv16i32;
1052 if (NumElements == 32) return MVT::nxv32i32;
1053 break;
1054 case MVT::i64:
1055 if (NumElements == 1) return MVT::nxv1i64;
1056 if (NumElements == 2) return MVT::nxv2i64;
1057 if (NumElements == 4) return MVT::nxv4i64;
1058 if (NumElements == 8) return MVT::nxv8i64;
1059 if (NumElements == 16) return MVT::nxv16i64;
1060 if (NumElements == 32) return MVT::nxv32i64;
1061 break;
1062 case MVT::f16:
1063 if (NumElements == 2) return MVT::nxv2f16;
1064 if (NumElements == 4) return MVT::nxv4f16;
1065 if (NumElements == 8) return MVT::nxv8f16;
1066 break;
1067 case MVT::f32:
1068 if (NumElements == 1) return MVT::nxv1f32;
1069 if (NumElements == 2) return MVT::nxv2f32;
1070 if (NumElements == 4) return MVT::nxv4f32;
1071 if (NumElements == 8) return MVT::nxv8f32;
1072 if (NumElements == 16) return MVT::nxv16f32;
1073 break;
1074 case MVT::f64:
1075 if (NumElements == 1) return MVT::nxv1f64;
1076 if (NumElements == 2) return MVT::nxv2f64;
1077 if (NumElements == 4) return MVT::nxv4f64;
1078 if (NumElements == 8) return MVT::nxv8f64;
1079 break;
1080 }
1081 return (MVT::SimpleValueType)(MVT::INVALID_SIMPLE_VALUE_TYPE);
1082 }
1083
1084 static MVT getVectorVT(MVT VT, unsigned NumElements, bool IsScalable) {
1085 if (IsScalable)
1086 return getScalableVectorVT(VT, NumElements);
1087 return getVectorVT(VT, NumElements);
1088 }
1089
1090 static MVT getVectorVT(MVT VT, ElementCount EC) {
1091 if (EC.Scalable)
1092 return getScalableVectorVT(VT, EC.Min);
1093 return getVectorVT(VT, EC.Min);
1094 }
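A sketch (illustrative; vectorVTExample is a hypothetical helper) of the fixed versus scalable vector factories above:

  #include "llvm/Support/MachineValueType.h"
  #include <cassert>

  void vectorVTExample() {
    assert(llvm::MVT::getVectorVT(llvm::MVT::i32, 4) == llvm::MVT::v4i32);
    assert(llvm::MVT::getVectorVT(llvm::MVT::i32, 4, /*IsScalable=*/true) ==
           llvm::MVT::nxv4i32);
    // Shapes with no matching simple type produce the invalid sentinel.
    assert(llvm::MVT::getVectorVT(llvm::MVT::f64, 3).SimpleTy ==
           llvm::MVT::INVALID_SIMPLE_VALUE_TYPE);
  }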
1095
1096 /// Return the value type corresponding to the specified type. This returns
1097 /// all pointers as iPTR. If HandleUnknown is true, unknown types are
1098 /// returned as Other, otherwise they are invalid.
1099 static MVT getVT(Type *Ty, bool HandleUnknown = false);
1100
1101 private:
1102 /// A simple iterator over the MVT::SimpleValueType enum.
1103 struct mvt_iterator {
1104 SimpleValueType VT;
1105
1106 mvt_iterator(SimpleValueType VT) : VT(VT) {}
1107
1108 MVT operator*() const { return VT; }
1109 bool operator!=(const mvt_iterator &LHS) const { return VT != LHS.VT; }
1110
1111 mvt_iterator& operator++() {
1112 VT = (MVT::SimpleValueType)((int)VT + 1);
1113      assert((int)VT <= MVT::MAX_ALLOWED_VALUETYPE &&
1114             "MVT iterator overflowed.");
1115 return *this;
1116 }
1117 };
1118
1119 /// A range of the MVT::SimpleValueType enum.
1120 using mvt_range = iterator_range<mvt_iterator>;
1121
1122 public:
1123 /// SimpleValueType Iteration
1124 /// @{
1125 static mvt_range all_valuetypes() {
1126 return mvt_range(MVT::FIRST_VALUETYPE, MVT::LAST_VALUETYPE);
1127 }
1128
1129 static mvt_range integer_valuetypes() {
1130 return mvt_range(MVT::FIRST_INTEGER_VALUETYPE,
1131 (MVT::SimpleValueType)(MVT::LAST_INTEGER_VALUETYPE + 1));
1132 }
1133
1134 static mvt_range fp_valuetypes() {
1135 return mvt_range(MVT::FIRST_FP_VALUETYPE,
1136 (MVT::SimpleValueType)(MVT::LAST_FP_VALUETYPE + 1));
1137 }
1138
1139 static mvt_range vector_valuetypes() {
1140 return mvt_range(MVT::FIRST_VECTOR_VALUETYPE,
1141 (MVT::SimpleValueType)(MVT::LAST_VECTOR_VALUETYPE + 1));
1142 }
1143
1144 static mvt_range fixedlen_vector_valuetypes() {
1145 return mvt_range(
1146 MVT::FIRST_FIXEDLEN_VECTOR_VALUETYPE,
1147 (MVT::SimpleValueType)(MVT::LAST_FIXEDLEN_VECTOR_VALUETYPE + 1));
1148 }
1149
1150 static mvt_range scalable_vector_valuetypes() {
1151 return mvt_range(
1152 MVT::FIRST_SCALABLE_VECTOR_VALUETYPE,
1153 (MVT::SimpleValueType)(MVT::LAST_SCALABLE_VECTOR_VALUETYPE + 1));
1154 }
1155
1156 static mvt_range integer_fixedlen_vector_valuetypes() {
1157 return mvt_range(
1158 MVT::FIRST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE,
1159 (MVT::SimpleValueType)(MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE + 1));
1160 }
1161
1162 static mvt_range fp_fixedlen_vector_valuetypes() {
1163 return mvt_range(
1164 MVT::FIRST_FP_FIXEDLEN_VECTOR_VALUETYPE,
1165 (MVT::SimpleValueType)(MVT::LAST_FP_FIXEDLEN_VECTOR_VALUETYPE + 1));
1166 }
1167
1168 static mvt_range integer_scalable_vector_valuetypes() {
1169 return mvt_range(
1170 MVT::FIRST_INTEGER_SCALABLE_VECTOR_VALUETYPE,
1171 (MVT::SimpleValueType)(MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE + 1));
1172 }
1173
1174 static mvt_range fp_scalable_vector_valuetypes() {
1175 return mvt_range(
1176 MVT::FIRST_FP_SCALABLE_VECTOR_VALUETYPE,
1177 (MVT::SimpleValueType)(MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE + 1));
1178 }
1179 /// @}
1180 };
1181
1182} // end namespace llvm
1183
1184#endif // LLVM_CODEGEN_MACHINEVALUETYPE_H
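A short sketch (not part of the header; countByteSizedIntegerMVTs is illustrative) of the SimpleValueType iteration ranges defined above:

  #include "llvm/Support/MachineValueType.h"

  unsigned countByteSizedIntegerMVTs() {
    unsigned N = 0;
    // Walks the scalar integer MVTs via the integer_valuetypes() range.
    for (llvm::MVT VT : llvm::MVT::integer_valuetypes())
      if (VT.isByteSized())
        ++N;
    return N;
  }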

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include/llvm/CodeGen/ValueTypes.h

1//===- CodeGen/ValueTypes.h - Low-Level Target independ. types --*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the set of low-level target independent types that values
10// in the code generator can have. This allows the target specific behavior
11// of instructions to be described to target independent passes.
12//
13//===----------------------------------------------------------------------===//
14
15#ifndef LLVM_CODEGEN_VALUETYPES_H
16#define LLVM_CODEGEN_VALUETYPES_H
17
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/MachineValueType.h"
20#include "llvm/Support/MathExtras.h"
21#include "llvm/Support/TypeSize.h"
22#include <cassert>
23#include <cstdint>
24#include <string>
25
26namespace llvm {
27
28 class LLVMContext;
29 class Type;
30
31 /// Extended Value Type. Capable of holding value types which are not native
32 /// for any processor (such as the i12345 type), as well as the types an MVT
33 /// can represent.
34 struct EVT {
35 private:
36 MVT V = MVT::INVALID_SIMPLE_VALUE_TYPE;
37 Type *LLVMTy = nullptr;
38
39 public:
40 constexpr EVT() = default;
41 constexpr EVT(MVT::SimpleValueType SVT) : V(SVT) {}
42 constexpr EVT(MVT S) : V(S) {}
43
44 bool operator==(EVT VT) const {
45 return !(*this != VT);
46 }
47 bool operator!=(EVT VT) const {
48 if (V.SimpleTy != VT.V.SimpleTy)
49 return true;
50 if (V.SimpleTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
51 return LLVMTy != VT.LLVMTy;
52 return false;
53 }
54
55 /// Returns the EVT that represents a floating-point type with the given
56 /// number of bits. There are two floating-point types with 128 bits - this
57 /// returns f128 rather than ppcf128.
58 static EVT getFloatingPointVT(unsigned BitWidth) {
59 return MVT::getFloatingPointVT(BitWidth);
60 }
61
62 /// Returns the EVT that represents an integer with the given number of
63 /// bits.
64 static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth) {
65 MVT M = MVT::getIntegerVT(BitWidth);
66 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
67 return M;
68 return getExtendedIntegerVT(Context, BitWidth);
69 }
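A minimal sketch (illustrative only; evtIntegerExample is a hypothetical helper) of the simple/extended split: widths with a matching MVT come back simple, while an odd width such as 12345 becomes an extended EVT:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include <cassert>

  void evtIntegerExample(llvm::LLVMContext &Ctx) {
    llvm::EVT Simple = llvm::EVT::getIntegerVT(Ctx, 64);
    llvm::EVT Odd = llvm::EVT::getIntegerVT(Ctx, 12345);
    assert(Simple.isSimple() && Simple.getSimpleVT() == llvm::MVT::i64);
    assert(Odd.isExtended() && Odd.getSizeInBits().getKnownMinSize() == 12345);
  }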
70
71 /// Returns the EVT that represents a vector NumElements in length, where
72 /// each element is of type VT.
73 static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements,
74 bool IsScalable = false) {
75 MVT M = MVT::getVectorVT(VT.V, NumElements, IsScalable);
76 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
77 return M;
78
79      assert(!IsScalable && "We don't support extended scalable types yet");
80 return getExtendedVectorVT(Context, VT, NumElements);
81 }
82
83 /// Returns the EVT that represents a vector EC.Min elements in length,
84 /// where each element is of type VT.
85 static EVT getVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) {
86 MVT M = MVT::getVectorVT(VT.V, EC);
87 if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE)
88 return M;
89      assert(!EC.Scalable && "We don't support extended scalable types yet");
90 return getExtendedVectorVT(Context, VT, EC.Min);
91 }
92
93 /// Return a vector with the same number of elements as this vector, but
94 /// with the element type converted to an integer type with the same
95 /// bitwidth.
96 EVT changeVectorElementTypeToInteger() const {
97 if (!isSimple()) {
98        assert(!isScalableVector() &&
99               "We don't support extended scalable types yet");
100 return changeExtendedVectorElementTypeToInteger();
101 }
102 MVT EltTy = getSimpleVT().getVectorElementType();
103 unsigned BitWidth = EltTy.getSizeInBits();
104 MVT IntTy = MVT::getIntegerVT(BitWidth);
105 MVT VecTy = MVT::getVectorVT(IntTy, getVectorNumElements(),
106 isScalableVector());
107      assert(VecTy.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE &&
108             "Simple vector VT not representable by simple integer vector VT!");
109 return VecTy;
110 }
111
112 /// Return the type converted to an equivalently sized integer or vector
113 /// with integer element type. Similar to changeVectorElementTypeToInteger,
114 /// but also handles scalars.
115 EVT changeTypeToInteger() {
116 if (isVector())
117 return changeVectorElementTypeToInteger();
118
119 if (isSimple())
120 return MVT::getIntegerVT(getSizeInBits());
121
122 return changeExtendedTypeToInteger();
123 }
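A sketch (illustrative; changeToIntegerExample is a hypothetical name) showing changeTypeToInteger() on a scalar FP type and on an FP vector:

  #include "llvm/CodeGen/ValueTypes.h"
  #include <cassert>

  void changeToIntegerExample() {
    llvm::EVT Scalar = llvm::MVT::f32;
    llvm::EVT Vec = llvm::MVT::v4f32;
    assert(Scalar.changeTypeToInteger() == llvm::EVT(llvm::MVT::i32));
    assert(Vec.changeTypeToInteger() == llvm::EVT(llvm::MVT::v4i32));
  }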
124
125 /// Test if the given EVT is simple (as opposed to being extended).
126 bool isSimple() const {
127      return V.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE;
15.1
Field 'SimpleTy' is not equal to INVALID_SIMPLE_VALUE_TYPE
16
Returning the value 1, which participates in a condition later
128 }
129
130 /// Test if the given EVT is extended (as opposed to being simple).
131 bool isExtended() const {
132 return !isSimple();
133 }
134
135 /// Return true if this is a FP or a vector FP type.
136 bool isFloatingPoint() const {
137 return isSimple() ? V.isFloatingPoint() : isExtendedFloatingPoint();
138 }
139
140 /// Return true if this is an integer or a vector integer type.
141 bool isInteger() const {
142 return isSimple() ? V.isInteger() : isExtendedInteger();
143 }
144
145 /// Return true if this is an integer, but not a vector.
146 bool isScalarInteger() const {
147 return isSimple() ? V.isScalarInteger() : isExtendedScalarInteger();
148 }
149
150 /// Return true if this is a vector value type.
151 bool isVector() const {
152 return isSimple() ? V.isVector() : isExtendedVector();
19
'?' condition is true
20
Calling 'MVT::isVector'
22
Returning from 'MVT::isVector'
23
Returning the value 1, which participates in a condition later
153 }
154
155 /// Return true if this is a vector type where the runtime
156 /// length is machine dependent
157 bool isScalableVector() const {
158 // FIXME: We don't support extended scalable types yet, because the
159 // matching IR type doesn't exist. Once it has been added, this can
160 // be changed to call isExtendedScalableVector.
161 if (!isSimple())
162 return false;
163 return V.isScalableVector();
164 }
165
166 /// Return true if this is a 16-bit vector type.
167 bool is16BitVector() const {
168 return isSimple() ? V.is16BitVector() : isExtended16BitVector();
169 }
170
171 /// Return true if this is a 32-bit vector type.
172 bool is32BitVector() const {
173 return isSimple() ? V.is32BitVector() : isExtended32BitVector();
174 }
175
176 /// Return true if this is a 64-bit vector type.
177 bool is64BitVector() const {
178 return isSimple() ? V.is64BitVector() : isExtended64BitVector();
179 }
180
181 /// Return true if this is a 128-bit vector type.
182 bool is128BitVector() const {
183 return isSimple() ? V.is128BitVector() : isExtended128BitVector();
184 }
185
186 /// Return true if this is a 256-bit vector type.
187 bool is256BitVector() const {
188 return isSimple() ? V.is256BitVector() : isExtended256BitVector();
189 }
190
191 /// Return true if this is a 512-bit vector type.
192 bool is512BitVector() const {
193 return isSimple() ? V.is512BitVector() : isExtended512BitVector();
194 }
195
196 /// Return true if this is a 1024-bit vector type.
197 bool is1024BitVector() const {
198 return isSimple() ? V.is1024BitVector() : isExtended1024BitVector();
199 }
200
201 /// Return true if this is a 2048-bit vector type.
202 bool is2048BitVector() const {
203 return isSimple() ? V.is2048BitVector() : isExtended2048BitVector();
204 }
205
206 /// Return true if this is an overloaded type for TableGen.
207 bool isOverloaded() const {
208 return (V==MVT::iAny || V==MVT::fAny || V==MVT::vAny || V==MVT::iPTRAny);
209 }
210
211 /// Return true if the bit size is a multiple of 8.
212 bool isByteSized() const {
213 return getSizeInBits().isByteSized();
214 }
215
216 /// Return true if the size is a power-of-two number of bytes.
217 bool isRound() const {
218 if (isScalableVector())
219 return false;
220 unsigned BitSize = getSizeInBits();
221 return BitSize >= 8 && !(BitSize & (BitSize - 1));
222 }
223
224 /// Return true if this has the same number of bits as VT.
225 bool bitsEq(EVT VT) const {
226 if (EVT::operator==(VT)) return true;
227 return getSizeInBits() == VT.getSizeInBits();
228 }
229
230 /// Return true if this has more bits than VT.
231 bool bitsGT(EVT VT) const {
232 if (EVT::operator==(VT)) return false;
233 return getSizeInBits() > VT.getSizeInBits();
234 }
235
236    /// Return true if this has no fewer bits than VT.
237 bool bitsGE(EVT VT) const {
238 if (EVT::operator==(VT)) return true;
239 return getSizeInBits() >= VT.getSizeInBits();
240 }
241
242 /// Return true if this has less bits than VT.
243 bool bitsLT(EVT VT) const {
244 if (EVT::operator==(VT)) return false;
245 return getSizeInBits() < VT.getSizeInBits();
246 }
247
248 /// Return true if this has no more bits than VT.
249 bool bitsLE(EVT VT) const {
250 if (EVT::operator==(VT)) return true;
251 return getSizeInBits() <= VT.getSizeInBits();
252 }
253
254 /// Return the SimpleValueType held in the specified simple EVT.
255 MVT getSimpleVT() const {
256      assert(isSimple() && "Expected a SimpleValueType!");
257 return V;
258 }
259
260 /// If this is a vector type, return the element type, otherwise return
261 /// this.
262 EVT getScalarType() const {
263 return isVector() ? getVectorElementType() : *this;
264 }
265
266 /// Given a vector type, return the type of each element.
267 EVT getVectorElementType() const {
268      assert(isVector() && "Invalid vector type!");
269 if (isSimple())
270 return V.getVectorElementType();
271 return getExtendedVectorElementType();
272 }
273
274 /// Given a vector type, return the number of elements it contains.
275 unsigned getVectorNumElements() const {
276      assert(isVector() && "Invalid vector type!");
277 if (isSimple())
278 return V.getVectorNumElements();
279 return getExtendedVectorNumElements();
280 }
281
282 // Given a (possibly scalable) vector type, return the ElementCount
283 ElementCount getVectorElementCount() const {
284      assert((isVector()) && "Invalid vector type!");
285 if (isSimple())
286 return V.getVectorElementCount();
287
288      assert(!isScalableVector() &&
289             "We don't support extended scalable types yet");
290 return {getExtendedVectorNumElements(), false};
291 }
292
293 /// Return the size of the specified value type in bits.
294 ///
295 /// If the value type is a scalable vector type, the scalable property will
296 /// be set and the runtime size will be a positive integer multiple of the
297 /// base size.
298 TypeSize getSizeInBits() const {
299 if (isSimple())
300 return V.getSizeInBits();
301 return getExtendedSizeInBits();
302 }
303
304 TypeSize getScalarSizeInBits() const {
305 return getScalarType().getSizeInBits();
306 }
307
308 /// Return the number of bytes overwritten by a store of the specified value
309 /// type.
310 ///
311 /// If the value type is a scalable vector type, the scalable property will
312 /// be set and the runtime size will be a positive integer multiple of the
313 /// base size.
314 TypeSize getStoreSize() const {
315 TypeSize BaseSize = getSizeInBits();
316 return {(BaseSize.getKnownMinSize() + 7) / 8, BaseSize.isScalable()};
317 }
318
319 /// Return the number of bits overwritten by a store of the specified value
320 /// type.
321 ///
322 /// If the value type is a scalable vector type, the scalable property will
323 /// be set and the runtime size will be a positive integer multiple of the
324 /// base size.
325 TypeSize getStoreSizeInBits() const {
326 return getStoreSize() * 8;
327 }
328
329 /// Rounds the bit-width of the given integer EVT up to the nearest power of
330 /// two (and at least to eight), and returns the integer EVT with that
331 /// number of bits.
332 EVT getRoundIntegerType(LLVMContext &Context) const {
333      assert(isInteger() && !isVector() && "Invalid integer type!");
334 unsigned BitWidth = getSizeInBits();
335 if (BitWidth <= 8)
336 return EVT(MVT::i8);
337 return getIntegerVT(Context, 1 << Log2_32_Ceil(BitWidth));
338 }
339
340 /// Finds the smallest simple value type that is greater than or equal to
341 /// half the width of this EVT. If no simple value type can be found, an
342 /// extended integer value type of half the size (rounded up) is returned.
343 EVT getHalfSizedIntegerVT(LLVMContext &Context) const {
344      assert(isInteger() && !isVector() && "Invalid integer type!");
345 unsigned EVTSize = getSizeInBits();
346 for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
347 IntVT <= MVT::LAST_INTEGER_VALUETYPE; ++IntVT) {
348 EVT HalfVT = EVT((MVT::SimpleValueType)IntVT);
349 if (HalfVT.getSizeInBits() * 2 >= EVTSize)
350 return HalfVT;
351 }
352 return getIntegerVT(Context, (EVTSize + 1) / 2);
353 }
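A sketch (illustrative only; roundAndHalveExample is a hypothetical helper) of rounding and halving an extended 20-bit integer EVT with the two helpers above:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include <cassert>

  void roundAndHalveExample(llvm::LLVMContext &Ctx) {
    llvm::EVT I20 = llvm::EVT::getIntegerVT(Ctx, 20);
    // 20 bits rounds up to the next power of two, i32.
    assert(I20.getRoundIntegerType(Ctx) == llvm::EVT(llvm::MVT::i32));
    // The smallest simple type covering at least half of 20 bits is i16.
    assert(I20.getHalfSizedIntegerVT(Ctx) == llvm::EVT(llvm::MVT::i16));
  }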
354
355 /// Return a VT for an integer vector type with the size of the
356    /// elements doubled. The type returned may be an extended type.
357 EVT widenIntegerVectorElementType(LLVMContext &Context) const {
358 EVT EltVT = getVectorElementType();
359 EltVT = EVT::getIntegerVT(Context, 2 * EltVT.getSizeInBits());
360 return EVT::getVectorVT(Context, EltVT, getVectorElementCount());
361 }
362
363 // Return a VT for a vector type with the same element type but
364 // half the number of elements. The type returned may be an
365 // extended type.
366 EVT getHalfNumVectorElementsVT(LLVMContext &Context) const {
367 EVT EltVT = getVectorElementType();
368 auto EltCnt = getVectorElementCount();
369      assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!");
370 return EVT::getVectorVT(Context, EltVT, EltCnt / 2);
371 }
372
373    /// Returns true if the given vector's number of elements is a power of 2.
374 bool isPow2VectorType() const {
375 unsigned NElts = getVectorNumElements();
376 return !(NElts & (NElts - 1));
377 }
378
379 /// Widens the length of the given vector EVT up to the nearest power of 2
380 /// and returns that type.
381 EVT getPow2VectorType(LLVMContext &Context) const {
382 if (!isPow2VectorType()) {
383 unsigned NElts = getVectorNumElements();
384 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
385 return EVT::getVectorVT(Context, getVectorElementType(), Pow2NElts,
386 isScalableVector());
387 }
388 else {
389 return *this;
390 }
391 }
392
393    /// This function returns the value type as a string, e.g. "i32".
394 std::string getEVTString() const;
395
396 /// This method returns an LLVM type corresponding to the specified EVT.
397 /// For integer types, this returns an unsigned type. Note that this will
398 /// abort for types that cannot be represented.
399 Type *getTypeForEVT(LLVMContext &Context) const;
400
401 /// Return the value type corresponding to the specified type.
402 /// This returns all pointers as iPTR. If HandleUnknown is true, unknown
403 /// types are returned as Other, otherwise they are invalid.
404 static EVT getEVT(Type *Ty, bool HandleUnknown = false);
405
406 intptr_t getRawBits() const {
407 if (isSimple())
408 return V.SimpleTy;
409 else
410 return (intptr_t)(LLVMTy);
411 }
412
413 /// A meaningless but well-behaved order, useful for constructing
414 /// containers.
415 struct compareRawBits {
416 bool operator()(EVT L, EVT R) const {
417 if (L.V.SimpleTy == R.V.SimpleTy)
418 return L.LLVMTy < R.LLVMTy;
419 else
420 return L.V.SimpleTy < R.V.SimpleTy;
421 }
422 };
423
424 private:
425 // Methods for handling the Extended-type case in functions above.
426 // These are all out-of-line to prevent users of this header file
427 // from having a dependency on Type.h.
428 EVT changeExtendedTypeToInteger() const;
429 EVT changeExtendedVectorElementTypeToInteger() const;
430 static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth);
431 static EVT getExtendedVectorVT(LLVMContext &C, EVT VT,
432 unsigned NumElements);
433    bool isExtendedFloatingPoint() const LLVM_READONLY;
434    bool isExtendedInteger() const LLVM_READONLY;
435    bool isExtendedScalarInteger() const LLVM_READONLY;
436    bool isExtendedVector() const LLVM_READONLY;
437    bool isExtended16BitVector() const LLVM_READONLY;
438    bool isExtended32BitVector() const LLVM_READONLY;
439    bool isExtended64BitVector() const LLVM_READONLY;
440    bool isExtended128BitVector() const LLVM_READONLY;
441    bool isExtended256BitVector() const LLVM_READONLY;
442    bool isExtended512BitVector() const LLVM_READONLY;
443    bool isExtended1024BitVector() const LLVM_READONLY;
444    bool isExtended2048BitVector() const LLVM_READONLY;
445    EVT getExtendedVectorElementType() const;
446    unsigned getExtendedVectorNumElements() const LLVM_READONLY;
447    TypeSize getExtendedSizeInBits() const LLVM_READONLY;
448 };
449
450} // end namespace llvm
451
452#endif // LLVM_CODEGEN_VALUETYPES_H

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include/llvm/ADT/STLExtras.h

1//===- llvm/ADT/STLExtras.h - Useful STL related functions ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains some templates that are useful if you are working with the
10// STL at all.
11//
12// No library is required when using these functions.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_ADT_STLEXTRAS_H
17#define LLVM_ADT_STLEXTRAS_H
18
19#include "llvm/ADT/Optional.h"
20#include "llvm/ADT/iterator.h"
21#include "llvm/ADT/iterator_range.h"
22#include "llvm/Config/abi-breaking.h"
23#include "llvm/Support/ErrorHandling.h"
24#include <algorithm>
25#include <cassert>
26#include <cstddef>
27#include <cstdint>
28#include <cstdlib>
29#include <functional>
30#include <initializer_list>
31#include <iterator>
32#include <limits>
33#include <memory>
34#include <tuple>
35#include <type_traits>
36#include <utility>
37
38#ifdef EXPENSIVE_CHECKS
39#include <random> // for std::mt19937
40#endif
41
42namespace llvm {
43
44// Only used by the compiler if both template types are the same. Useful when
45// using SFINAE to test for the existence of member functions.
46template <typename T, T> struct SameType;
47
48namespace detail {
49
50template <typename RangeT>
51using IterOfRange = decltype(std::begin(std::declval<RangeT &>()));
52
53} // end namespace detail
54
55//===----------------------------------------------------------------------===//
56// Extra additions to <type_traits>
57//===----------------------------------------------------------------------===//
58
59template <typename T>
60struct negation : std::integral_constant<bool, !bool(T::value)> {};
61
62template <typename...> struct conjunction : std::true_type {};
63template <typename B1> struct conjunction<B1> : B1 {};
64template <typename B1, typename... Bn>
65struct conjunction<B1, Bn...>
66 : std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {};
67
68template <typename T> struct make_const_ptr {
69 using type =
70 typename std::add_pointer<typename std::add_const<T>::type>::type;
71};
72
73template <typename T> struct make_const_ref {
74 using type = typename std::add_lvalue_reference<
75 typename std::add_const<T>::type>::type;
76};
77
78//===----------------------------------------------------------------------===//
79// Extra additions to <functional>
80//===----------------------------------------------------------------------===//
81
82template <class Ty> struct identity {
83 using argument_type = Ty;
84
85 Ty &operator()(Ty &self) const {
86 return self;
87 }
88 const Ty &operator()(const Ty &self) const {
89 return self;
90 }
91};
92
93/// An efficient, type-erasing, non-owning reference to a callable. This is
94/// intended for use as the type of a function parameter that is not used
95/// after the function in question returns.
96///
97/// This class does not own the callable, so it is not in general safe to store
98/// a function_ref.
99template<typename Fn> class function_ref;
100
101template<typename Ret, typename ...Params>
102class function_ref<Ret(Params...)> {
103 Ret (*callback)(intptr_t callable, Params ...params) = nullptr;
104 intptr_t callable;
105
106 template<typename Callable>
107 static Ret callback_fn(intptr_t callable, Params ...params) {
108 return (*reinterpret_cast<Callable*>(callable))(
109 std::forward<Params>(params)...);
110 }
111
112public:
113 function_ref() = default;
114 function_ref(std::nullptr_t) {}
115
116 template <typename Callable>
117 function_ref(Callable &&callable,
118 std::enable_if_t<!std::is_same<std::remove_reference_t<Callable>,
119 function_ref>::value> * = nullptr)
120 : callback(callback_fn<typename std::remove_reference<Callable>::type>),
121 callable(reinterpret_cast<intptr_t>(&callable)) {}
122
123 Ret operator()(Params ...params) const {
124 return callback(callable, std::forward<Params>(params)...);
125 }
126
127 operator bool() const { return callback; }
128};
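A usage sketch (applyTwice and functionRefExample are illustrative, not LLVM APIs): function_ref passes a lambda by non-owning reference, so the callable only needs to outlive the call:

  #include "llvm/ADT/STLExtras.h"
  #include <cassert>

  static int applyTwice(int X, llvm::function_ref<int(int)> F) {
    return F(F(X));
  }

  void functionRefExample() {
    int Bias = 3;
    assert(applyTwice(1, [&](int V) { return V + Bias; }) == 7);
  }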
129
130// deleter - Very very very simple method that is used to invoke operator
131// delete on something. It is used like this:
132//
133// for_each(V.begin(), B.end(), deleter<Interval>);
134template <class T>
135inline void deleter(T *Ptr) {
136 delete Ptr;
137}
138
139//===----------------------------------------------------------------------===//
140// Extra additions to <iterator>
141//===----------------------------------------------------------------------===//
142
143namespace adl_detail {
144
145using std::begin;
146
147template <typename ContainerTy>
148decltype(auto) adl_begin(ContainerTy &&container) {
149 return begin(std::forward<ContainerTy>(container));
150}
151
152using std::end;
153
154template <typename ContainerTy>
155decltype(auto) adl_end(ContainerTy &&container) {
156 return end(std::forward<ContainerTy>(container));
157}
158
159using std::swap;
160
161template <typename T>
162void adl_swap(T &&lhs, T &&rhs) noexcept(noexcept(swap(std::declval<T>(),
163 std::declval<T>()))) {
164 swap(std::forward<T>(lhs), std::forward<T>(rhs));
165}
166
167} // end namespace adl_detail
168
169template <typename ContainerTy>
170decltype(auto) adl_begin(ContainerTy &&container) {
171 return adl_detail::adl_begin(std::forward<ContainerTy>(container));
172}
173
174template <typename ContainerTy>
175decltype(auto) adl_end(ContainerTy &&container) {
176 return adl_detail::adl_end(std::forward<ContainerTy>(container));
177}
178
179template <typename T>
180void adl_swap(T &&lhs, T &&rhs) noexcept(
181 noexcept(adl_detail::adl_swap(std::declval<T>(), std::declval<T>()))) {
182 adl_detail::adl_swap(std::forward<T>(lhs), std::forward<T>(rhs));
183}
184
185/// Test whether \p RangeOrContainer is empty. Similar to C++17 std::empty.
186template <typename T>
187constexpr bool empty(const T &RangeOrContainer) {
188 return adl_begin(RangeOrContainer) == adl_end(RangeOrContainer);
189}
190
191/// Return a range covering \p RangeOrContainer with the first N elements
192/// excluded.
193template <typename T> auto drop_begin(T &&RangeOrContainer, size_t N) {
194 return make_range(std::next(adl_begin(RangeOrContainer), N),
195 adl_end(RangeOrContainer));
196}
197
198// mapped_iterator - This is a simple iterator adapter that causes a function to
199// be applied whenever operator* is invoked on the iterator.
200
201template <typename ItTy, typename FuncTy,
202 typename FuncReturnTy =
203 decltype(std::declval<FuncTy>()(*std::declval<ItTy>()))>
204class mapped_iterator
205 : public iterator_adaptor_base<
206 mapped_iterator<ItTy, FuncTy>, ItTy,
207 typename std::iterator_traits<ItTy>::iterator_category,
208 typename std::remove_reference<FuncReturnTy>::type> {
209public:
210 mapped_iterator(ItTy U, FuncTy F)
211 : mapped_iterator::iterator_adaptor_base(std::move(U)), F(std::move(F)) {}
212
213 ItTy getCurrent() { return this->I; }
214
215 FuncReturnTy operator*() { return F(*this->I); }
216
217private:
218 FuncTy F;
219};
220
221// map_iterator - Provide a convenient way to create mapped_iterators, just like
222// make_pair is useful for creating pairs...
223template <class ItTy, class FuncTy>
224inline mapped_iterator<ItTy, FuncTy> map_iterator(ItTy I, FuncTy F) {
225 return mapped_iterator<ItTy, FuncTy>(std::move(I), std::move(F));
226}
227
228template <class ContainerTy, class FuncTy>
229auto map_range(ContainerTy &&C, FuncTy F) {
230 return make_range(map_iterator(C.begin(), F), map_iterator(C.end(), F));
231}
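A sketch (mapRangeExample is an illustrative name) of map_range applying a function lazily as the range is traversed:

  #include "llvm/ADT/STLExtras.h"
  #include <cassert>
  #include <vector>

  void mapRangeExample() {
    std::vector<int> V = {1, 2, 3};
    int Sum = 0;
    for (int Doubled : llvm::map_range(V, [](int X) { return 2 * X; }))
      Sum += Doubled;
    assert(Sum == 12);
  }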
232
233/// Helper to determine if type T has a member called rbegin().
234template <typename Ty> class has_rbegin_impl {
235 using yes = char[1];
236 using no = char[2];
237
238 template <typename Inner>
239 static yes& test(Inner *I, decltype(I->rbegin()) * = nullptr);
240
241 template <typename>
242 static no& test(...);
243
244public:
245 static const bool value = sizeof(test<Ty>(nullptr)) == sizeof(yes);
246};
247
248/// Metafunction to determine if T& or T has a member called rbegin().
249template <typename Ty>
250struct has_rbegin : has_rbegin_impl<typename std::remove_reference<Ty>::type> {
251};
252
253// Returns an iterator_range over the given container which iterates in reverse.
254// Note that the container must have rbegin()/rend() methods for this to work.
255template <typename ContainerTy>
256auto reverse(ContainerTy &&C,
257 std::enable_if_t<has_rbegin<ContainerTy>::value> * = nullptr) {
258 return make_range(C.rbegin(), C.rend());
259}
260
261// Returns a std::reverse_iterator wrapped around the given iterator.
262template <typename IteratorTy>
263std::reverse_iterator<IteratorTy> make_reverse_iterator(IteratorTy It) {
264 return std::reverse_iterator<IteratorTy>(It);
265}
266
267// Returns an iterator_range over the given container which iterates in reverse.
268// Note that the container must have begin()/end() methods which return
269// bidirectional iterators for this to work.
270template <typename ContainerTy>
271auto reverse(ContainerTy &&C,
272 std::enable_if_t<!has_rbegin<ContainerTy>::value> * = nullptr) {
273 return make_range(llvm::make_reverse_iterator(std::end(C)),
274 llvm::make_reverse_iterator(std::begin(C)));
275}
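A sketch (reverseExample is illustrative): llvm::reverse works on containers with rbegin()/rend() and, via this overload, on ones that only expose bidirectional begin()/end():

  #include "llvm/ADT/STLExtras.h"
  #include <cassert>
  #include <vector>

  void reverseExample() {
    std::vector<int> V = {1, 2, 3};
    std::vector<int> Out;
    for (int X : llvm::reverse(V))
      Out.push_back(X);
    assert((Out == std::vector<int>{3, 2, 1}));
  }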
276
277/// An iterator adaptor that filters the elements of given inner iterators.
278///
279/// The predicate parameter should be a callable object that accepts the wrapped
280/// iterator's reference type and returns a bool. When incrementing or
281/// decrementing the iterator, it will call the predicate on each element and
282/// skip any where it returns false.
283///
284/// \code
285/// int A[] = { 1, 2, 3, 4 };
286/// auto R = make_filter_range(A, [](int N) { return N % 2 == 1; });
287/// // R contains { 1, 3 }.
288/// \endcode
289///
290/// Note: filter_iterator_base implements support for forward iteration.
291/// filter_iterator_impl exists to provide support for bidirectional iteration,
292/// conditional on whether the wrapped iterator supports it.
293template <typename WrappedIteratorT, typename PredicateT, typename IterTag>
294class filter_iterator_base
295 : public iterator_adaptor_base<
296 filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
297 WrappedIteratorT,
298 typename std::common_type<
299 IterTag, typename std::iterator_traits<
300 WrappedIteratorT>::iterator_category>::type> {
301 using BaseT = iterator_adaptor_base<
302 filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>,
303 WrappedIteratorT,
304 typename std::common_type<
305 IterTag, typename std::iterator_traits<
306 WrappedIteratorT>::iterator_category>::type>;
307
308protected:
309 WrappedIteratorT End;
310 PredicateT Pred;
311
312 void findNextValid() {
313 while (this->I != End && !Pred(*this->I))
314 BaseT::operator++();
315 }
316
317 // Construct the iterator. The begin iterator needs to know where the end
318 // is, so that it can properly stop when it gets there. The end iterator only
319 // needs the predicate to support bidirectional iteration.
320 filter_iterator_base(WrappedIteratorT Begin, WrappedIteratorT End,
321 PredicateT Pred)
322 : BaseT(Begin), End(End), Pred(Pred) {
323 findNextValid();
324 }
325
326public:
327 using BaseT::operator++;
328
329 filter_iterator_base &operator++() {
330 BaseT::operator++();
331 findNextValid();
332 return *this;
333 }
334};
335
336/// Specialization of filter_iterator_base for forward iteration only.
337template <typename WrappedIteratorT, typename PredicateT,
338 typename IterTag = std::forward_iterator_tag>
339class filter_iterator_impl
340 : public filter_iterator_base<WrappedIteratorT, PredicateT, IterTag> {
341 using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT, IterTag>;
342
343public:
344 filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
345 PredicateT Pred)
346 : BaseT(Begin, End, Pred) {}
347};
348
349/// Specialization of filter_iterator_base for bidirectional iteration.
350template <typename WrappedIteratorT, typename PredicateT>
351class filter_iterator_impl<WrappedIteratorT, PredicateT,
352 std::bidirectional_iterator_tag>
353 : public filter_iterator_base<WrappedIteratorT, PredicateT,
354 std::bidirectional_iterator_tag> {
355 using BaseT = filter_iterator_base<WrappedIteratorT, PredicateT,
356 std::bidirectional_iterator_tag>;
357 void findPrevValid() {
358 while (!this->Pred(*this->I))
359 BaseT::operator--();
360 }
361
362public:
363 using BaseT::operator--;
364
365 filter_iterator_impl(WrappedIteratorT Begin, WrappedIteratorT End,
366 PredicateT Pred)
367 : BaseT(Begin, End, Pred) {}
368
369 filter_iterator_impl &operator--() {
370 BaseT::operator--();
371 findPrevValid();
372 return *this;
373 }
374};
375
376namespace detail {
377
378template <bool is_bidirectional> struct fwd_or_bidi_tag_impl {
379 using type = std::forward_iterator_tag;
380};
381
382template <> struct fwd_or_bidi_tag_impl<true> {
383 using type = std::bidirectional_iterator_tag;
384};
385
386/// Helper which sets its type member to forward_iterator_tag if the category
387/// of \p IterT does not derive from bidirectional_iterator_tag, and to
388/// bidirectional_iterator_tag otherwise.
389template <typename IterT> struct fwd_or_bidi_tag {
390 using type = typename fwd_or_bidi_tag_impl<std::is_base_of<
391 std::bidirectional_iterator_tag,
392 typename std::iterator_traits<IterT>::iterator_category>::value>::type;
393};
394
395} // namespace detail
396
397/// Defines filter_iterator to a suitable specialization of
398/// filter_iterator_impl, based on the underlying iterator's category.
399template <typename WrappedIteratorT, typename PredicateT>
400using filter_iterator = filter_iterator_impl<
401 WrappedIteratorT, PredicateT,
402 typename detail::fwd_or_bidi_tag<WrappedIteratorT>::type>;
403
404/// Convenience function that takes a range of elements and a predicate,
405/// and returns a new filter_iterator range.
406///
407/// FIXME: Currently if RangeT && is an rvalue reference to a temporary, the
408/// lifetime of that temporary is not kept by the returned range object, and the
409/// temporary is going to be dropped on the floor after the make_iterator_range
410/// full expression that contains this function call.
411template <typename RangeT, typename PredicateT>
412iterator_range<filter_iterator<detail::IterOfRange<RangeT>, PredicateT>>
413make_filter_range(RangeT &&Range, PredicateT Pred) {
414 using FilterIteratorT =
415 filter_iterator<detail::IterOfRange<RangeT>, PredicateT>;
416 return make_range(
417 FilterIteratorT(std::begin(std::forward<RangeT>(Range)),
418 std::end(std::forward<RangeT>(Range)), Pred),
419 FilterIteratorT(std::end(std::forward<RangeT>(Range)),
420 std::end(std::forward<RangeT>(Range)), Pred));
421}
422
423/// A pseudo-iterator adaptor that is designed to implement "early increment"
424/// style loops.
425///
426/// This is *not a normal iterator* and should almost never be used directly. It
427/// is intended primarily to be used with range based for loops and some range
428/// algorithms.
429///
430/// The iterator isn't quite an `OutputIterator` or an `InputIterator` but
431/// somewhere between them. The constraints of these iterators are:
432///
433/// - On construction or after being incremented, it is comparable and
434///   dereferenceable. It is *not* incrementable.
435/// - After being dereferenced, it is neither comparable nor dereferenceable; it
436/// is only incrementable.
437///
438/// This means you can only dereference the iterator once, and you can only
439/// increment it once between dereferences.
440template <typename WrappedIteratorT>
441class early_inc_iterator_impl
442 : public iterator_adaptor_base<early_inc_iterator_impl<WrappedIteratorT>,
443 WrappedIteratorT, std::input_iterator_tag> {
444 using BaseT =
445 iterator_adaptor_base<early_inc_iterator_impl<WrappedIteratorT>,
446 WrappedIteratorT, std::input_iterator_tag>;
447
448 using PointerT = typename std::iterator_traits<WrappedIteratorT>::pointer;
449
450protected:
451#if LLVM_ENABLE_ABI_BREAKING_CHECKS
452 bool IsEarlyIncremented = false;
453#endif
454
455public:
456 early_inc_iterator_impl(WrappedIteratorT I) : BaseT(I) {}
457
458 using BaseT::operator*;
459 typename BaseT::reference operator*() {
460#if LLVM_ENABLE_ABI_BREAKING_CHECKS
461    assert(!IsEarlyIncremented && "Cannot dereference twice!");
462 IsEarlyIncremented = true;
463#endif
464 return *(this->I)++;
465 }
466
467 using BaseT::operator++;
468 early_inc_iterator_impl &operator++() {
469#if LLVM_ENABLE_ABI_BREAKING_CHECKS
470    assert(IsEarlyIncremented && "Cannot increment before dereferencing!");
471 IsEarlyIncremented = false;
472#endif
473 return *this;
474 }
475
476 using BaseT::operator==;
477 bool operator==(const early_inc_iterator_impl &RHS) const {
478#if LLVM_ENABLE_ABI_BREAKING_CHECKS
479    assert(!IsEarlyIncremented && "Cannot compare after dereferencing!");
480#endif
481 return BaseT::operator==(RHS);
482 }
483};
484
485/// Make a range that does early increment to allow mutation of the underlying
486/// range without disrupting iteration.
487///
488/// The underlying iterator will be incremented immediately after it is
489/// dereferenced, allowing deletion of the current node or insertion of nodes to
490/// not disrupt iteration provided they do not invalidate the *next* iterator --
491/// the current iterator can be invalidated.
492///
493/// This requires a very exact pattern of use that is only really suitable to
494/// range based for loops and other range algorithms that explicitly guarantee
495/// to dereference exactly once each element, and to increment exactly once each
496/// element.
497template <typename RangeT>
498iterator_range<early_inc_iterator_impl<detail::IterOfRange<RangeT>>>
499make_early_inc_range(RangeT &&Range) {
500 using EarlyIncIteratorT =
501 early_inc_iterator_impl<detail::IterOfRange<RangeT>>;
502 return make_range(EarlyIncIteratorT(std::begin(std::forward<RangeT>(Range))),
503 EarlyIncIteratorT(std::end(std::forward<RangeT>(Range))));
504}
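A sketch (earlyIncExample is illustrative) of the intended pattern: the loop body may erase the element it is currently visiting because the wrapped iterator has already been advanced:

  #include "llvm/ADT/STLExtras.h"
  #include <cassert>
  #include <set>

  void earlyIncExample() {
    std::set<int> S = {1, 2, 3, 4};
    for (int X : llvm::make_early_inc_range(S))
      if (X % 2 == 0)
        S.erase(X); // Safe: the underlying iterator has already moved past X.
    assert(S.size() == 2);
  }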
505
506// forward declarations required by zip_shortest/zip_first/zip_longest
507template <typename R, typename UnaryPredicate>
508bool all_of(R &&range, UnaryPredicate P);
509template <typename R, typename UnaryPredicate>
510bool any_of(R &&range, UnaryPredicate P);
511
512namespace detail {
513
514using std::declval;
515
516// We have to alias this since inlining the actual type at the usage site
517// in the parameter list of iterator_facade_base<> below ICEs MSVC 2017.
518template<typename... Iters> struct ZipTupleType {
519 using type = std::tuple<decltype(*declval<Iters>())...>;
520};
521
522template <typename ZipType, typename... Iters>
523using zip_traits = iterator_facade_base<
524 ZipType, typename std::common_type<std::bidirectional_iterator_tag,
525 typename std::iterator_traits<
526 Iters>::iterator_category...>::type,
527 // ^ TODO: Implement random access methods.
528 typename ZipTupleType<Iters...>::type,
529 typename std::iterator_traits<typename std::tuple_element<
530 0, std::tuple<Iters...>>::type>::difference_type,
531 // ^ FIXME: This follows boost::make_zip_iterator's assumption that all
532 // inner iterators have the same difference_type. It would fail if, for
533 // instance, the second field's difference_type were non-numeric while the
534 // first is.
535 typename ZipTupleType<Iters...>::type *,
536 typename ZipTupleType<Iters...>::type>;
537
538template <typename ZipType, typename... Iters>
539struct zip_common : public zip_traits<ZipType, Iters...> {
540 using Base = zip_traits<ZipType, Iters...>;
541 using value_type = typename Base::value_type;
542
543 std::tuple<Iters...> iterators;
544
545protected:
546 template <size_t... Ns> value_type deref(std::index_sequence<Ns...>) const {
547 return value_type(*std::get<Ns>(iterators)...);
548 }
549
550 template <size_t... Ns>
551 decltype(iterators) tup_inc(std::index_sequence<Ns...>) const {
552 return std::tuple<Iters...>(std::next(std::get<Ns>(iterators))...);
553 }
554
555 template <size_t... Ns>
556 decltype(iterators) tup_dec(std::index_sequence<Ns...>) const {
557 return std::tuple<Iters...>(std::prev(std::get<Ns>(iterators))...);
558 }
559
560public:
561 zip_common(Iters &&... ts) : iterators(std::forward<Iters>(ts)...) {}
562
563 value_type operator*() { return deref(std::index_sequence_for<Iters...>{}); }
564
565 const value_type operator*() const {
566 return deref(std::index_sequence_for<Iters...>{});
567 }
568
569 ZipType &operator++() {
570 iterators = tup_inc(std::index_sequence_for<Iters...>{});
571 return *reinterpret_cast<ZipType *>(this);
572 }
573
574 ZipType &operator--() {
575 static_assert(Base::IsBidirectional,
576 "All inner iterators must be at least bidirectional.");
577 iterators = tup_dec(std::index_sequence_for<Iters...>{});
578 return *reinterpret_cast<ZipType *>(this);
579 }
580};
581
582template <typename... Iters>
583struct zip_first : public zip_common<zip_first<Iters...>, Iters...> {
584 using Base = zip_common<zip_first<Iters...>, Iters...>;
585
586 bool operator==(const zip_first<Iters...> &other) const {
587 return std::get<0>(this->iterators) == std::get<0>(other.iterators);
588 }
589
590 zip_first(Iters &&... ts) : Base(std::forward<Iters>(ts)...) {}
591};
592
593template <typename... Iters>
594class zip_shortest : public zip_common<zip_shortest<Iters...>, Iters...> {
595 template <size_t... Ns>
596 bool test(const zip_shortest<Iters...> &other,
597 std::index_sequence<Ns...>) const {
598 return all_of(std::initializer_list<bool>{std::get<Ns>(this->iterators) !=
599 std::get<Ns>(other.iterators)...},
600 identity<bool>{});
601 }
602
603public:
604 using Base = zip_common<zip_shortest<Iters...>, Iters...>;
605
606 zip_shortest(Iters &&... ts) : Base(std::forward<Iters>(ts)...) {}
607
608 bool operator==(const zip_shortest<Iters...> &other) const {
609 return !test(other, std::index_sequence_for<Iters...>{});
610 }
611};
612
613template <template <typename...> class ItType, typename... Args> class zippy {
614public:
615 using iterator = ItType<decltype(std::begin(std::declval<Args>()))...>;
616 using iterator_category = typename iterator::iterator_category;
617 using value_type = typename iterator::value_type;
618 using difference_type = typename iterator::difference_type;
619 using pointer = typename iterator::pointer;
620 using reference = typename iterator::reference;
621
622private:
623 std::tuple<Args...> ts;
624
625 template <size_t... Ns>
626 iterator begin_impl(std::index_sequence<Ns...>) const {
627 return iterator(std::begin(std::get<Ns>(ts))...);
628 }
629 template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) const {
630 return iterator(std::end(std::get<Ns>(ts))...);
631 }
632
633public:
634 zippy(Args &&... ts_) : ts(std::forward<Args>(ts_)...) {}
635
636 iterator begin() const {
637 return begin_impl(std::index_sequence_for<Args...>{});
638 }
639 iterator end() const { return end_impl(std::index_sequence_for<Args...>{}); }
640};
641
642} // end namespace detail
643
644/// zip iterator for two or more iterable types.
645template <typename T, typename U, typename... Args>
646detail::zippy<detail::zip_shortest, T, U, Args...> zip(T &&t, U &&u,
647 Args &&... args) {
648 return detail::zippy<detail::zip_shortest, T, U, Args...>(
649 std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
650}
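
A minimal usage sketch (the containers and names below are illustrative, not part of this header): zip() walks the ranges in lockstep and stops at the end of the shortest one.

#include "llvm/ADT/STLExtras.h"
#include <cstdio>
#include <vector>

void zipSketch() {
  std::vector<int> Ints = {1, 2, 3};
  std::vector<char> Chars = {'a', 'b'};
  // Dereferencing the zip iterator yields a std::tuple of references;
  // the loop runs twice because Chars is the shorter range.
  for (auto T : llvm::zip(Ints, Chars))
    std::printf("%d %c\n", std::get<0>(T), std::get<1>(T));
}
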
651
652/// zip iterator that, for the sake of efficiency, assumes the first iteratee to
653/// be the shortest.
654template <typename T, typename U, typename... Args>
655detail::zippy<detail::zip_first, T, U, Args...> zip_first(T &&t, U &&u,
656 Args &&... args) {
657 return detail::zippy<detail::zip_first, T, U, Args...>(
658 std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
659}
660
661namespace detail {
662template <typename Iter>
663static Iter next_or_end(const Iter &I, const Iter &End) {
664 if (I == End)
665 return End;
666 return std::next(I);
667}
668
669template <typename Iter>
670static auto deref_or_none(const Iter &I, const Iter &End) -> llvm::Optional<
671 std::remove_const_t<std::remove_reference_t<decltype(*I)>>> {
672 if (I == End)
673 return None;
674 return *I;
675}
676
677template <typename Iter> struct ZipLongestItemType {
678 using type =
679 llvm::Optional<typename std::remove_const<typename std::remove_reference<
680 decltype(*std::declval<Iter>())>::type>::type>;
681};
682
683template <typename... Iters> struct ZipLongestTupleType {
684 using type = std::tuple<typename ZipLongestItemType<Iters>::type...>;
685};
686
687template <typename... Iters>
688class zip_longest_iterator
689 : public iterator_facade_base<
690 zip_longest_iterator<Iters...>,
691 typename std::common_type<
692 std::forward_iterator_tag,
693 typename std::iterator_traits<Iters>::iterator_category...>::type,
694 typename ZipLongestTupleType<Iters...>::type,
695 typename std::iterator_traits<typename std::tuple_element<
696 0, std::tuple<Iters...>>::type>::difference_type,
697 typename ZipLongestTupleType<Iters...>::type *,
698 typename ZipLongestTupleType<Iters...>::type> {
699public:
700 using value_type = typename ZipLongestTupleType<Iters...>::type;
701
702private:
703 std::tuple<Iters...> iterators;
704 std::tuple<Iters...> end_iterators;
705
706 template <size_t... Ns>
707 bool test(const zip_longest_iterator<Iters...> &other,
708 std::index_sequence<Ns...>) const {
709 return llvm::any_of(
710 std::initializer_list<bool>{std::get<Ns>(this->iterators) !=
711 std::get<Ns>(other.iterators)...},
712 identity<bool>{});
713 }
714
715 template <size_t... Ns> value_type deref(std::index_sequence<Ns...>) const {
716 return value_type(
717 deref_or_none(std::get<Ns>(iterators), std::get<Ns>(end_iterators))...);
718 }
719
720 template <size_t... Ns>
721 decltype(iterators) tup_inc(std::index_sequence<Ns...>) const {
722 return std::tuple<Iters...>(
723 next_or_end(std::get<Ns>(iterators), std::get<Ns>(end_iterators))...);
724 }
725
726public:
727 zip_longest_iterator(std::pair<Iters &&, Iters &&>... ts)
728 : iterators(std::forward<Iters>(ts.first)...),
729 end_iterators(std::forward<Iters>(ts.second)...) {}
730
731 value_type operator*() { return deref(std::index_sequence_for<Iters...>{}); }
732
733 value_type operator*() const {
734 return deref(std::index_sequence_for<Iters...>{});
735 }
736
737 zip_longest_iterator<Iters...> &operator++() {
738 iterators = tup_inc(std::index_sequence_for<Iters...>{});
739 return *this;
740 }
741
742 bool operator==(const zip_longest_iterator<Iters...> &other) const {
743 return !test(other, std::index_sequence_for<Iters...>{});
744 }
745};
746
747template <typename... Args> class zip_longest_range {
748public:
749 using iterator =
750 zip_longest_iterator<decltype(adl_begin(std::declval<Args>()))...>;
751 using iterator_category = typename iterator::iterator_category;
752 using value_type = typename iterator::value_type;
753 using difference_type = typename iterator::difference_type;
754 using pointer = typename iterator::pointer;
755 using reference = typename iterator::reference;
756
757private:
758 std::tuple<Args...> ts;
759
760 template <size_t... Ns>
761 iterator begin_impl(std::index_sequence<Ns...>) const {
762 return iterator(std::make_pair(adl_begin(std::get<Ns>(ts)),
763 adl_end(std::get<Ns>(ts)))...);
764 }
765
766 template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) const {
767 return iterator(std::make_pair(adl_end(std::get<Ns>(ts)),
768 adl_end(std::get<Ns>(ts)))...);
769 }
770
771public:
772 zip_longest_range(Args &&... ts_) : ts(std::forward<Args>(ts_)...) {}
773
774 iterator begin() const {
775 return begin_impl(std::index_sequence_for<Args...>{});
776 }
777 iterator end() const { return end_impl(std::index_sequence_for<Args...>{}); }
778};
779} // namespace detail
780
781/// Iterate over two or more iterators at the same time. Iteration continues
782/// until all iterators reach the end. The llvm::Optional only contains a value
783/// if the iterator has not reached the end.
784template <typename T, typename U, typename... Args>
785detail::zip_longest_range<T, U, Args...> zip_longest(T &&t, U &&u,
786 Args &&... args) {
787 return detail::zip_longest_range<T, U, Args...>(
788 std::forward<T>(t), std::forward<U>(u), std::forward<Args>(args)...);
789}
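
A small illustrative sketch, assuming the usual llvm::Optional API: once the shorter range is exhausted, its slot in each tuple yields None.

#include "llvm/ADT/STLExtras.h"
#include <vector>

int zipLongestSketch() {
  std::vector<int> A = {1, 2, 3};
  std::vector<int> B = {10};
  int Sum = 0;
  for (auto T : llvm::zip_longest(A, B)) {
    // Each element is an llvm::Optional<int>; B's slot is None after the
    // first iteration, so it contributes 0 below.
    Sum += std::get<0>(T).getValueOr(0) + std::get<1>(T).getValueOr(0);
  }
  return Sum; // 16
}
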
790
791/// Iterator wrapper that concatenates sequences together.
792///
793/// This can concatenate different iterators, even with different types, into
794/// a single iterator provided the value types of all the concatenated
795/// iterators expose `reference` and `pointer` types that can be converted to
796/// `ValueT &` and `ValueT *` respectively. It doesn't support more
797/// interesting/customized pointer or reference types.
798///
799/// Currently this only supports forward or higher iterator categories as
800/// inputs and always exposes a forward iterator interface.
801template <typename ValueT, typename... IterTs>
802class concat_iterator
803 : public iterator_facade_base<concat_iterator<ValueT, IterTs...>,
804 std::forward_iterator_tag, ValueT> {
805 using BaseT = typename concat_iterator::iterator_facade_base;
806
807 /// We store both the current and end iterators for each concatenated
808 /// sequence in a tuple of pairs.
809 ///
810 /// Note that something like iterator_range seems nice at first here, but the
811 /// range properties are of little benefit and end up getting in the way
812 /// because we need to do mutation on the current iterators.
813 std::tuple<IterTs...> Begins;
814 std::tuple<IterTs...> Ends;
815
816 /// Attempts to increment a specific iterator.
817 ///
818 /// Returns true if it was able to increment the iterator. Returns false if
819 /// the iterator is already at the end iterator.
820 template <size_t Index> bool incrementHelper() {
821 auto &Begin = std::get<Index>(Begins);
822 auto &End = std::get<Index>(Ends);
823 if (Begin == End)
824 return false;
825
826 ++Begin;
827 return true;
828 }
829
830 /// Increments the first non-end iterator.
831 ///
832 /// It is an error to call this with all iterators at the end.
833 template <size_t... Ns> void increment(std::index_sequence<Ns...>) {
834 // Build a sequence of functions to increment each iterator if possible.
835 bool (concat_iterator::*IncrementHelperFns[])() = {
836 &concat_iterator::incrementHelper<Ns>...};
837
838 // Loop over them, and stop as soon as we succeed at incrementing one.
839 for (auto &IncrementHelperFn : IncrementHelperFns)
840 if ((this->*IncrementHelperFn)())
841 return;
842
843 llvm_unreachable("Attempted to increment an end concat iterator!");
844 }
845
846 /// Returns null if the specified iterator is at the end. Otherwise,
847 /// dereferences the iterator and returns the address of the resulting
848 /// reference.
849 template <size_t Index> ValueT *getHelper() const {
850 auto &Begin = std::get<Index>(Begins);
851 auto &End = std::get<Index>(Ends);
852 if (Begin == End)
853 return nullptr;
854
855 return &*Begin;
856 }
857
858 /// Finds the first non-end iterator, dereferences, and returns the resulting
859 /// reference.
860 ///
861 /// It is an error to call this with all iterators at the end.
862 template <size_t... Ns> ValueT &get(std::index_sequence<Ns...>) const {
863 // Build a sequence of functions to get from iterator if possible.
864 ValueT *(concat_iterator::*GetHelperFns[])() const = {
865 &concat_iterator::getHelper<Ns>...};
866
867 // Loop over them, and return the first result we find.
868 for (auto &GetHelperFn : GetHelperFns)
869 if (ValueT *P = (this->*GetHelperFn)())
870 return *P;
871
872 llvm_unreachable("Attempted to get a pointer from an end concat iterator!");
873 }
874
875public:
876 /// Constructs an iterator from a sequence of ranges.
877 ///
878 /// We need the full range to know how to switch between each of the
879 /// iterators.
880 template <typename... RangeTs>
881 explicit concat_iterator(RangeTs &&... Ranges)
882 : Begins(std::begin(Ranges)...), Ends(std::end(Ranges)...) {}
883
884 using BaseT::operator++;
885
886 concat_iterator &operator++() {
887 increment(std::index_sequence_for<IterTs...>());
888 return *this;
889 }
890
891 ValueT &operator*() const {
892 return get(std::index_sequence_for<IterTs...>());
893 }
894
895 bool operator==(const concat_iterator &RHS) const {
896 return Begins == RHS.Begins && Ends == RHS.Ends;
897 }
898};
899
900namespace detail {
901
902/// Helper to store a sequence of ranges being concatenated and access them.
903///
904/// This is designed to facilitate providing actual storage when temporaries
905/// are passed into the constructor such that we can use it as part of range
906/// based for loops.
907template <typename ValueT, typename... RangeTs> class concat_range {
908public:
909 using iterator =
910 concat_iterator<ValueT,
911 decltype(std::begin(std::declval<RangeTs &>()))...>;
912
913private:
914 std::tuple<RangeTs...> Ranges;
915
916 template <size_t... Ns> iterator begin_impl(std::index_sequence<Ns...>) {
917 return iterator(std::get<Ns>(Ranges)...);
918 }
919 template <size_t... Ns> iterator end_impl(std::index_sequence<Ns...>) {
920 return iterator(make_range(std::end(std::get<Ns>(Ranges)),
921 std::end(std::get<Ns>(Ranges)))...);
922 }
923
924public:
925 concat_range(RangeTs &&... Ranges)
926 : Ranges(std::forward<RangeTs>(Ranges)...) {}
927
928 iterator begin() { return begin_impl(std::index_sequence_for<RangeTs...>{}); }
929 iterator end() { return end_impl(std::index_sequence_for<RangeTs...>{}); }
930};
931
932} // end namespace detail
933
934/// Concatenated range across two or more ranges.
935///
936/// The desired value type must be explicitly specified.
937template <typename ValueT, typename... RangeTs>
938detail::concat_range<ValueT, RangeTs...> concat(RangeTs &&... Ranges) {
939 static_assert(sizeof...(RangeTs) > 1,
940 "Need more than one range to concatenate!");
941 return detail::concat_range<ValueT, RangeTs...>(
942 std::forward<RangeTs>(Ranges)...);
943}
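
A short usage sketch with illustrative vectors: the value type must be spelled out explicitly, and the ranges are visited back to back as one flattened range.

#include "llvm/ADT/STLExtras.h"
#include <vector>

int concatSketch() {
  std::vector<int> V1 = {1, 2};
  std::vector<int> V2 = {3, 4};
  int Sum = 0;
  // Visits 1, 2, 3, 4 as a single range of int&.
  for (int &X : llvm::concat<int>(V1, V2))
    Sum += X;
  return Sum; // 10
}
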
944
945//===----------------------------------------------------------------------===//
946// Extra additions to <utility>
947//===----------------------------------------------------------------------===//
948
949/// Function object to check whether the first component of a std::pair
950/// compares less than the first component of another std::pair.
951struct less_first {
952 template <typename T> bool operator()(const T &lhs, const T &rhs) const {
953 return lhs.first < rhs.first;
954 }
955};
956
957/// Function object to check whether the second component of a std::pair
958/// compares less than the second component of another std::pair.
959struct less_second {
960 template <typename T> bool operator()(const T &lhs, const T &rhs) const {
961 return lhs.second < rhs.second;
962 }
963};
964
965/// \brief Function object to apply a binary function to the first component of
966/// a std::pair.
967template<typename FuncTy>
968struct on_first {
969 FuncTy func;
970
971 template <typename T>
972 decltype(auto) operator()(const T &lhs, const T &rhs) const {
973 return func(lhs.first, rhs.first);
974 }
975};
976
977/// Utility type to build an inheritance chain that makes it easy to rank
978/// overload candidates.
979template <int N> struct rank : rank<N - 1> {};
980template <> struct rank<0> {};
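
A sketch of the usual overload-ranking idiom built on rank<>; hasSize/hasSizeImpl are hypothetical names introduced only for illustration. The rank<1> overload wins whenever its SFINAE constraint is satisfied because it is an exact match, while the rank<0> fallback requires a derived-to-base conversion.

#include "llvm/ADT/STLExtras.h"

// Preferred overload: only viable when Obj.size() is well-formed.
template <typename T>
auto hasSizeImpl(const T &Obj, llvm::rank<1>)
    -> decltype((void)Obj.size(), true) {
  return true;
}
// Fallback overload: chosen when the rank<1> candidate drops out.
template <typename T> bool hasSizeImpl(const T &, llvm::rank<0>) {
  return false;
}

template <typename T> bool hasSize(const T &Obj) {
  return hasSizeImpl(Obj, llvm::rank<1>{});
}
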
981
982/// traits class for checking whether type T is one of any of the given
983/// types in the variadic list.
984template <typename T, typename... Ts> struct is_one_of {
985 static const bool value = false;
986};
987
988template <typename T, typename U, typename... Ts>
989struct is_one_of<T, U, Ts...> {
990 static const bool value =
991 std::is_same<T, U>::value || is_one_of<T, Ts...>::value;
992};
993
994/// traits class for checking whether type T is a base class for all
995/// the given types in the variadic list.
996template <typename T, typename... Ts> struct are_base_of {
997 static const bool value = true;
998};
999
1000template <typename T, typename U, typename... Ts>
1001struct are_base_of<T, U, Ts...> {
1002 static const bool value =
1003 std::is_base_of<T, U>::value && are_base_of<T, Ts...>::value;
1004};
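
Both traits lend themselves to compile-time checks; the static_asserts below are an illustrative sketch.

#include "llvm/ADT/STLExtras.h"
#include <stdexcept>

static_assert(llvm::is_one_of<int, float, int, char>::value,
              "int appears in the type list");
static_assert(llvm::are_base_of<std::exception, std::runtime_error,
                                std::logic_error>::value,
              "both types derive from std::exception");
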
1005
1006//===----------------------------------------------------------------------===//
1007// Extra additions for arrays
1008//===----------------------------------------------------------------------===//
1009
1010// We have a copy here so that LLVM behaves the same when using different
1011// standard libraries.
1012template <class Iterator, class RNG>
1013void shuffle(Iterator first, Iterator last, RNG &&g) {
1014 // It would be better to use a std::uniform_int_distribution,
1015 // but that would be stdlib dependent.
1016 for (auto size = last - first; size > 1; ++first, (void)--size)
1017 std::iter_swap(first, first + g() % size);
1018}
1019
1020/// Find the length of an array.
1021template <class T, std::size_t N>
1022constexpr inline size_t array_lengthof(T (&)[N]) {
1023 return N;
1024}
1025
1026/// Adapt std::less<T> for array_pod_sort.
1027template<typename T>
1028inline int array_pod_sort_comparator(const void *P1, const void *P2) {
1029 if (std::less<T>()(*reinterpret_cast<const T*>(P1),
1030 *reinterpret_cast<const T*>(P2)))
1031 return -1;
1032 if (std::less<T>()(*reinterpret_cast<const T*>(P2),
1033 *reinterpret_cast<const T*>(P1)))
1034 return 1;
1035 return 0;
1036}
1037
1038/// get_array_pod_sort_comparator - This is an internal helper function used to
1039/// get type deduction of T right.
1040template<typename T>
1041inline int (*get_array_pod_sort_comparator(const T &))
1042 (const void*, const void*) {
1043 return array_pod_sort_comparator<T>;
1044}
1045
1046#ifdef EXPENSIVE_CHECKS
1047namespace detail {
1048
1049inline unsigned presortShuffleEntropy() {
1050 static unsigned Result(std::random_device{}());
1051 return Result;
1052}
1053
1054template <class IteratorTy>
1055inline void presortShuffle(IteratorTy Start, IteratorTy End) {
1056 std::mt19937 Generator(presortShuffleEntropy());
1057 std::shuffle(Start, End, Generator);
1058}
1059
1060} // end namespace detail
1061#endif
1062
1063/// array_pod_sort - This sorts an array with the specified start and end
1064/// extent. This is just like std::sort, except that it calls qsort instead of
1065/// using an inlined template. qsort is slightly slower than std::sort, but
1066/// most sorts are not performance critical in LLVM and std::sort has to be
1067/// template instantiated for each type, leading to significant measured code
1068/// bloat. This function should generally be used instead of std::sort where
1069/// possible.
1070///
1071/// This function assumes that you have simple POD-like types that can be
1072/// compared with std::less and can be moved with memcpy. If this isn't true,
1073/// you should use std::sort.
1074///
1075/// NOTE: If qsort_r were portable, we could allow a custom comparator and
1076/// default to std::less.
1077template<class IteratorTy>
1078inline void array_pod_sort(IteratorTy Start, IteratorTy End) {
1079 // Don't inefficiently call qsort with one element or trigger undefined
1080 // behavior with an empty sequence.
1081 auto NElts = End - Start;
1082 if (NElts <= 1) return;
1083#ifdef EXPENSIVE_CHECKS
1084 detail::presortShuffle<IteratorTy>(Start, End);
1085#endif
1086 qsort(&*Start, NElts, sizeof(*Start), get_array_pod_sort_comparator(*Start));
1087}
1088
1089template <class IteratorTy>
1090inline void array_pod_sort(
1091 IteratorTy Start, IteratorTy End,
1092 int (*Compare)(
1093 const typename std::iterator_traits<IteratorTy>::value_type *,
1094 const typename std::iterator_traits<IteratorTy>::value_type *)) {
1095 // Don't inefficiently call qsort with one element or trigger undefined
1096 // behavior with an empty sequence.
1097 auto NElts = End - Start;
1098 if (NElts <= 1) return;
1099#ifdef EXPENSIVE_CHECKS
1100 detail::presortShuffle<IteratorTy>(Start, End);
1101#endif
1102 qsort(&*Start, NElts, sizeof(*Start),
1103 reinterpret_cast<int (*)(const void *, const void *)>(Compare));
1104}
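
A minimal sketch covering both overloads over a plain POD array; the data and the descending comparator are illustrative.

#include "llvm/ADT/STLExtras.h"
#include <iterator>

// Custom qsort-style comparator: reverse of the default std::less ordering.
static int descending(const int *A, const int *B) { return *B - *A; }

void podSortSketch() {
  int Vals[] = {3, 1, 4, 1, 5};
  llvm::array_pod_sort(std::begin(Vals), std::end(Vals));             // 1 1 3 4 5
  llvm::array_pod_sort(std::begin(Vals), std::end(Vals), descending); // 5 4 3 1 1
}
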
1105
1106// Provide wrappers to std::sort which shuffle the elements before sorting
1107// to help uncover non-deterministic behavior (PR35135).
1108template <typename IteratorTy>
1109inline void sort(IteratorTy Start, IteratorTy End) {
1110#ifdef EXPENSIVE_CHECKS
1111 detail::presortShuffle<IteratorTy>(Start, End);
1112#endif
1113 std::sort(Start, End);
1114}
1115
1116template <typename Container> inline void sort(Container &&C) {
1117 llvm::sort(adl_begin(C), adl_end(C));
1118}
1119
1120template <typename IteratorTy, typename Compare>
1121inline void sort(IteratorTy Start, IteratorTy End, Compare Comp) {
1122#ifdef EXPENSIVE_CHECKS
1123 detail::presortShuffle<IteratorTy>(Start, End);
1124#endif
1125 std::sort(Start, End, Comp);
1126}
1127
1128template <typename Container, typename Compare>
1129inline void sort(Container &&C, Compare Comp) {
1130 llvm::sort(adl_begin(C), adl_end(C), Comp);
1131}
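
A short sketch combining the range-based llvm::sort() wrapper with less_first from earlier in this header; the data is illustrative.

#include "llvm/ADT/STLExtras.h"
#include <utility>
#include <vector>

void sortSketch() {
  std::vector<std::pair<int, const char *>> Pairs = {
      {2, "two"}, {1, "one"}, {3, "three"}};
  // Orders by the .first member: (1,"one"), (2,"two"), (3,"three").
  llvm::sort(Pairs, llvm::less_first());
}
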
1132
1133//===----------------------------------------------------------------------===//
1134// Extra additions to <algorithm>
1135//===----------------------------------------------------------------------===//
1136
1137/// For a container of pointers, deletes the pointers and then clears the
1138/// container.
1139template<typename Container>
1140void DeleteContainerPointers(Container &C) {
1141 for (auto V : C)
1142 delete V;
1143 C.clear();
1144}
1145
1146/// In a container of pairs (usually a map) whose second element is a pointer,
1147/// deletes the second elements and then clears the container.
1148template<typename Container>
1149void DeleteContainerSeconds(Container &C) {
1150 for (auto &V : C)
1151 delete V.second;
1152 C.clear();
1153}
1154
1155/// Get the size of a range. This is a wrapper function around std::distance
1156/// which is only enabled when the operation is O(1).
1157template <typename R>
1158auto size(R &&Range,
1159 std::enable_if_t<std::is_same<typename std::iterator_traits<decltype(
1160 Range.begin())>::iterator_category,
1161 std::random_access_iterator_tag>::value,
1162 void> * = nullptr) {
1163 return std::distance(Range.begin(), Range.end());
1164}
1165
1166/// Provide wrappers to std::for_each which take ranges instead of having to
1167/// pass begin/end explicitly.
1168template <typename R, typename UnaryPredicate>
1169UnaryPredicate for_each(R &&Range, UnaryPredicate P) {
1170 return std::for_each(adl_begin(Range), adl_end(Range), P);
1171}
1172
1173/// Provide wrappers to std::all_of which take ranges instead of having to pass
1174/// begin/end explicitly.
1175template <typename R, typename UnaryPredicate>
1176bool all_of(R &&Range, UnaryPredicate P) {
1177 return std::all_of(adl_begin(Range), adl_end(Range), P);
1178}
1179
1180/// Provide wrappers to std::any_of which take ranges instead of having to pass
1181/// begin/end explicitly.
1182template <typename R, typename UnaryPredicate>
1183bool any_of(R &&Range, UnaryPredicate P) {
1184 return std::any_of(adl_begin(Range), adl_end(Range), P);
34
Calling 'any_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp:34578:30)>'
39
Returning from 'any_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp:34578:30)>'
40
Returning zero, which participates in a condition later
1185}
1186
1187/// Provide wrappers to std::none_of which take ranges instead of having to pass
1188/// begin/end explicitly.
1189template <typename R, typename UnaryPredicate>
1190bool none_of(R &&Range, UnaryPredicate P) {
1191 return std::none_of(adl_begin(Range), adl_end(Range), P);
1192}
1193
1194/// Provide wrappers to std::find which take ranges instead of having to pass
1195/// begin/end explicitly.
1196template <typename R, typename T> auto find(R &&Range, const T &Val) {
1197 return std::find(adl_begin(Range), adl_end(Range), Val);
1198}
1199
1200/// Provide wrappers to std::find_if which take ranges instead of having to pass
1201/// begin/end explicitly.
1202template <typename R, typename UnaryPredicate>
1203auto find_if(R &&Range, UnaryPredicate P) {
1204 return std::find_if(adl_begin(Range), adl_end(Range), P);
1205}
1206
1207template <typename R, typename UnaryPredicate>
1208auto find_if_not(R &&Range, UnaryPredicate P) {
1209 return std::find_if_not(adl_begin(Range), adl_end(Range), P);
1210}
1211
1212/// Provide wrappers to std::remove_if which take ranges instead of having to
1213/// pass begin/end explicitly.
1214template <typename R, typename UnaryPredicate>
1215auto remove_if(R &&Range, UnaryPredicate P) {
1216 return std::remove_if(adl_begin(Range), adl_end(Range), P);
1217}
1218
1219/// Provide wrappers to std::copy_if which take ranges instead of having to
1220/// pass begin/end explicitly.
1221template <typename R, typename OutputIt, typename UnaryPredicate>
1222OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P) {
1223 return std::copy_if(adl_begin(Range), adl_end(Range), Out, P);
1224}
1225
1226template <typename R, typename OutputIt>
1227OutputIt copy(R &&Range, OutputIt Out) {
1228 return std::copy(adl_begin(Range), adl_end(Range), Out);
1229}
1230
1231/// Wrapper function around std::find to detect if an element exists
1232/// in a container.
1233template <typename R, typename E>
1234bool is_contained(R &&Range, const E &Element) {
1235 return std::find(adl_begin(Range), adl_end(Range), Element) != adl_end(Range);
1236}
1237
1238/// Wrapper function around std::count to count the number of times an element
1239/// \p Element occurs in the given range \p Range.
1240template <typename R, typename E> auto count(R &&Range, const E &Element) {
1241 return std::count(adl_begin(Range), adl_end(Range), Element);
1242}
1243
1244/// Wrapper function around std::count_if to count the number of times an
1245/// element satisfying a given predicate occurs in a range.
1246template <typename R, typename UnaryPredicate>
1247auto count_if(R &&Range, UnaryPredicate P) {
1248 return std::count_if(adl_begin(Range), adl_end(Range), P);
1249}
1250
1251/// Wrapper function around std::transform to apply a function to a range and
1252/// store the result elsewhere.
1253template <typename R, typename OutputIt, typename UnaryPredicate>
1254OutputIt transform(R &&Range, OutputIt d_first, UnaryPredicate P) {
1255 return std::transform(adl_begin(Range), adl_end(Range), d_first, P);
1256}
1257
1258/// Provide wrappers to std::partition which take ranges instead of having to
1259/// pass begin/end explicitly.
1260template <typename R, typename UnaryPredicate>
1261auto partition(R &&Range, UnaryPredicate P) {
1262 return std::partition(adl_begin(Range), adl_end(Range), P);
1263}
1264
1265/// Provide wrappers to std::lower_bound which take ranges instead of having to
1266/// pass begin/end explicitly.
1267template <typename R, typename T> auto lower_bound(R &&Range, T &&Value) {
1268 return std::lower_bound(adl_begin(Range), adl_end(Range),
1269 std::forward<T>(Value));
1270}
1271
1272template <typename R, typename T, typename Compare>
1273auto lower_bound(R &&Range, T &&Value, Compare C) {
1274 return std::lower_bound(adl_begin(Range), adl_end(Range),
1275 std::forward<T>(Value), C);
1276}
1277
1278/// Provide wrappers to std::upper_bound which take ranges instead of having to
1279/// pass begin/end explicitly.
1280template <typename R, typename T> auto upper_bound(R &&Range, T &&Value) {
1281 return std::upper_bound(adl_begin(Range), adl_end(Range),
1282 std::forward<T>(Value));
1283}
1284
1285template <typename R, typename T, typename Compare>
1286auto upper_bound(R &&Range, T &&Value, Compare C) {
1287 return std::upper_bound(adl_begin(Range), adl_end(Range),
1288 std::forward<T>(Value), C);
1289}
1290
1291template <typename R>
1292void stable_sort(R &&Range) {
1293 std::stable_sort(adl_begin(Range), adl_end(Range));
1294}
1295
1296template <typename R, typename Compare>
1297void stable_sort(R &&Range, Compare C) {
1298 std::stable_sort(adl_begin(Range), adl_end(Range), C);
1299}
1300
1301/// Binary search for the first iterator in a range where a predicate is false.
1302/// Requires that C is always true below some limit, and always false above it.
1303template <typename R, typename Predicate,
1304 typename Val = decltype(*adl_begin(std::declval<R>()))>
1305auto partition_point(R &&Range, Predicate P) {
1306 return std::partition_point(adl_begin(Range), adl_end(Range), P);
1307}
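
A small sketch, assuming the predicate is true for a prefix of the range and false afterwards, as the comment above requires.

#include "llvm/ADT/STLExtras.h"
#include <vector>

int firstLargeValue() {
  std::vector<int> V = {1, 2, 3, 10, 20};
  // The predicate holds for the prefix {1, 2, 3}, so the returned iterator
  // points at 10.
  auto It = llvm::partition_point(V, [](int X) { return X < 5; });
  return *It; // 10
}
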
1308
1309/// Wrapper function around std::equal to detect if all elements
1310/// in a container are the same.
1311template <typename R>
1312bool is_splat(R &&Range) {
1313 size_t range_size = size(Range);
1314 return range_size != 0 && (range_size == 1 ||
1315 std::equal(adl_begin(Range) + 1, adl_end(Range), adl_begin(Range)));
1316}
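
An illustrative sketch; note that an empty range is not considered a splat.

#include "llvm/ADT/STLExtras.h"
#include <vector>

void splatSketch() {
  std::vector<int> Same = {4, 4, 4};
  std::vector<int> Mixed = {4, 5};
  std::vector<int> Empty;
  bool A = llvm::is_splat(Same);  // true
  bool B = llvm::is_splat(Mixed); // false
  bool C = llvm::is_splat(Empty); // false: empty ranges are not splats
  (void)A; (void)B; (void)C;
}
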
1317
1318/// Provide a container algorithm similar to C++ Library Fundamentals v2's
1319/// `erase_if` which is equivalent to:
1320///
1321/// C.erase(remove_if(C, pred), C.end());
1322///
1323/// This version works for any container with an erase method call accepting
1324/// two iterators.
1325template <typename Container, typename UnaryPredicate>
1326void erase_if(Container &C, UnaryPredicate P) {
1327 C.erase(remove_if(C, P), C.end());
1328}
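
A usage sketch with an illustrative container and predicate, removing the even values in place.

#include "llvm/ADT/STLExtras.h"
#include <vector>

void eraseIfSketch() {
  std::vector<int> V = {1, 2, 3, 4, 5};
  llvm::erase_if(V, [](int X) { return X % 2 == 0; });
  // V is now {1, 3, 5}.
}
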
1329
1330/// Given a sequence container Cont, replace the range [ContIt, ContEnd) with
1331/// the range [ValIt, ValEnd) (which is not from the same container).
1332template<typename Container, typename RandomAccessIterator>
1333void replace(Container &Cont, typename Container::iterator ContIt,
1334 typename Container::iterator ContEnd, RandomAccessIterator ValIt,
1335 RandomAccessIterator ValEnd) {
1336 while (true) {
1337 if (ValIt == ValEnd) {
1338 Cont.erase(ContIt, ContEnd);
1339 return;
1340 } else if (ContIt == ContEnd) {
1341 Cont.insert(ContIt, ValIt, ValEnd);
1342 return;
1343 }
1344 *ContIt++ = *ValIt++;
1345 }
1346}
1347
1348/// Given a sequence container Cont, replace the range [ContIt, ContEnd) with
1349/// the range R.
1350template<typename Container, typename Range = std::initializer_list<
1351 typename Container::value_type>>
1352void replace(Container &Cont, typename Container::iterator ContIt,
1353 typename Container::iterator ContEnd, Range R) {
1354 replace(Cont, ContIt, ContEnd, R.begin(), R.end());
1355}
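
A sketch of the initializer_list overload; the container grows or shrinks as needed to fit the replacement values.

#include "llvm/ADT/STLExtras.h"
#include <vector>

void replaceSketch() {
  std::vector<int> V = {1, 2, 3, 4};
  // Replace the two middle elements with three new values.
  llvm::replace(V, V.begin() + 1, V.begin() + 3, {7, 8, 9});
  // V is now {1, 7, 8, 9, 4}.
}
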
1356
1357//===----------------------------------------------------------------------===//
1358// Extra additions to <memory>
1359//===----------------------------------------------------------------------===//
1360
1361struct FreeDeleter {
1362 void operator()(void* v) {
1363 ::free(v);
1364 }
1365};
1366
1367template<typename First, typename Second>
1368struct pair_hash {
1369 size_t operator()(const std::pair<First, Second> &P) const {
1370 return std::hash<First>()(P.first) * 31 + std::hash<Second>()(P.second);
1371 }
1372};
1373
1374/// Binary functor that adapts to any other binary functor after dereferencing
1375/// operands.
1376template <typename T> struct deref {
1377 T func;
1378
1379 // Could be further improved to cope with non-derivable functors and
1380 // non-binary functors (should be a variadic template member function
1381 // operator()).
1382 template <typename A, typename B> auto operator()(A &lhs, B &rhs) const {
1383 assert(lhs);
1384 assert(rhs);
1385 return func(*lhs, *rhs);
1386 }
1387};
1388
1389namespace detail {
1390
1391template <typename R> class enumerator_iter;
1392
1393template <typename R> struct result_pair {
1394 using value_reference =
1395 typename std::iterator_traits<IterOfRange<R>>::reference;
1396
1397 friend class enumerator_iter<R>;
1398
1399 result_pair() = default;
1400 result_pair(std::size_t Index, IterOfRange<R> Iter)
1401 : Index(Index), Iter(Iter) {}
1402
1403 result_pair<R>(const result_pair<R> &Other)
1404 : Index(Other.Index), Iter(Other.Iter) {}
1405 result_pair<R> &operator=(const result_pair<R> &Other) {
1406 Index = Other.Index;
1407 Iter = Other.Iter;
1408 return *this;
1409 }
1410
1411 std::size_t index() const { return Index; }
1412 const value_reference value() const { return *Iter; }
1413 value_reference value() { return *Iter; }
1414
1415private:
1416 std::size_t Index = std::numeric_limits<std::size_t>::max();
1417 IterOfRange<R> Iter;
1418};
1419
1420template <typename R>
1421class enumerator_iter
1422 : public iterator_facade_base<
1423 enumerator_iter<R>, std::forward_iterator_tag, result_pair<R>,
1424 typename std::iterator_traits<IterOfRange<R>>::difference_type,
1425 typename std::iterator_traits<IterOfRange<R>>::pointer,
1426 typename std::iterator_traits<IterOfRange<R>>::reference> {
1427 using result_type = result_pair<R>;
1428
1429public:
1430 explicit enumerator_iter(IterOfRange<R> EndIter)
1431 : Result(std::numeric_limits<size_t>::max(), EndIter) {}
1432
1433 enumerator_iter(std::size_t Index, IterOfRange<R> Iter)
1434 : Result(Index, Iter) {}
1435
1436 result_type &operator*() { return Result; }
1437 const result_type &operator*() const { return Result; }
1438
1439 enumerator_iter<R> &operator++() {
1440 assert(Result.Index != std::numeric_limits<size_t>::max());
1441 ++Result.Iter;
1442 ++Result.Index;
1443 return *this;
1444 }
1445
1446 bool operator==(const enumerator_iter<R> &RHS) const {
1447 // Don't compare indices here, only iterators. It's possible for an end
1448 // iterator to have different indices depending on whether it was created
1449 // by calling std::end() versus incrementing a valid iterator.
1450 return Result.Iter == RHS.Result.Iter;
1451 }
1452
1453 enumerator_iter<R>(const enumerator_iter<R> &Other) : Result(Other.Result) {}
1454 enumerator_iter<R> &operator=(const enumerator_iter<R> &Other) {
1455 Result = Other.Result;
1456 return *this;
1457 }
1458
1459private:
1460 result_type Result;
1461};
1462
1463template <typename R> class enumerator {
1464public:
1465 explicit enumerator(R &&Range) : TheRange(std::forward<R>(Range)) {}
1466
1467 enumerator_iter<R> begin() {
1468 return enumerator_iter<R>(0, std::begin(TheRange));
1469 }
1470
1471 enumerator_iter<R> end() {
1472 return enumerator_iter<R>(std::end(TheRange));
1473 }
1474
1475private:
1476 R TheRange;
1477};
1478
1479} // end namespace detail
1480
1481/// Given an input range, returns a new range whose values are pairs (A, B)
1482/// such that A is the 0-based index of the item in the sequence, and B is
1483/// the value from the original sequence. Example:
1484///
1485/// std::vector<char> Items = {'A', 'B', 'C', 'D'};
1486/// for (auto X : enumerate(Items)) {
1487/// printf("Item %d - %c\n", X.index(), X.value());
1488/// }
1489///
1490/// Output:
1491/// Item 0 - A
1492/// Item 1 - B
1493/// Item 2 - C
1494/// Item 3 - D
1495///
1496template <typename R> detail::enumerator<R> enumerate(R &&TheRange) {
1497 return detail::enumerator<R>(std::forward<R>(TheRange));
1498}
1499
1500namespace detail {
1501
1502template <typename F, typename Tuple, std::size_t... I>
1503decltype(auto) apply_tuple_impl(F &&f, Tuple &&t, std::index_sequence<I...>) {
1504 return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
1505}
1506
1507} // end namespace detail
1508
1509/// Given an input tuple (a1, a2, ..., an), pass the arguments of the
1510/// tuple variadically to f as if by calling f(a1, a2, ..., an) and
1511/// return the result.
1512template <typename F, typename Tuple>
1513decltype(auto) apply_tuple(F &&f, Tuple &&t) {
1514 using Indices = std::make_index_sequence<
1515 std::tuple_size<typename std::decay<Tuple>::type>::value>;
1516
1517 return detail::apply_tuple_impl(std::forward<F>(f), std::forward<Tuple>(t),
1518 Indices{});
1519}
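
A minimal sketch forwarding a tuple's elements as call arguments; the lambda and values are illustrative.

#include "llvm/ADT/STLExtras.h"
#include <tuple>

int applyTupleSketch() {
  auto Add = [](int A, int B) { return A + B; };
  // Calls Add(2, 3) with the tuple's elements.
  return llvm::apply_tuple(Add, std::make_tuple(2, 3)); // 5
}
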
1520
1521/// Return true if the sequence [Begin, End) has exactly N items. Runs in O(N)
1522/// time. Not meant for use with random-access iterators.
1523template <typename IterTy>
1524bool hasNItems(
1525 IterTy &&Begin, IterTy &&End, unsigned N,
1526 std::enable_if_t<
1527 !std::is_same<typename std::iterator_traits<std::remove_reference_t<
1528 decltype(Begin)>>::iterator_category,
1529 std::random_access_iterator_tag>::value,
1530 void> * = nullptr) {
1531 for (; N; --N, ++Begin)
1532 if (Begin == End)
1533 return false; // Too few.
1534 return Begin == End;
1535}
1536
1537/// Return true if the sequence [Begin, End) has N or more items. Runs in O(N)
1538/// time. Not meant for use with random-access iterators.
1539template <typename IterTy>
1540bool hasNItemsOrMore(
1541 IterTy &&Begin, IterTy &&End, unsigned N,
1542 std::enable_if_t<
1543 !std::is_same<typename std::iterator_traits<std::remove_reference_t<
1544 decltype(Begin)>>::iterator_category,
1545 std::random_access_iterator_tag>::value,
1546 void> * = nullptr) {
1547 for (; N; --N, ++Begin)
1548 if (Begin == End)
1549 return false; // Too few.
1550 return true;
1551}
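
A sketch of the counting helpers using std::list, since both overloads are disabled for random-access iterators; the counts are illustrative.

#include "llvm/ADT/STLExtras.h"
#include <list>

void hasNItemsSketch() {
  std::list<int> L = {1, 2, 3};
  bool ExactlyThree = llvm::hasNItems(L.begin(), L.end(), 3);      // true
  bool AtLeastTwo = llvm::hasNItemsOrMore(L.begin(), L.end(), 2);  // true
  (void)ExactlyThree; (void)AtLeastTwo;
}
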
1552
1553/// Returns a raw pointer that represents the same address as the argument.
1554///
1555/// This implementation can be removed once we move to C++20 where it's defined
1556/// as std::to_address().
1557///
1558/// The std::pointer_traits<>::to_address(p) variations of these overloads have
1559/// not been implemented.
1560template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); }
1561template <class T> constexpr T *to_address(T *P) { return P; }
1562
1563} // end namespace llvm
1564
1565#endif // LLVM_ADT_STLEXTRAS_H

/usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/bits/stl_algo.h

1// Algorithm implementation -*- C++ -*-
2
3// Copyright (C) 2001-2016 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the
7// terms of the GNU General Public License as published by the
8// Free Software Foundation; either version 3, or (at your option)
9// any later version.
10
11// This library is distributed in the hope that it will be useful,
12// but WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14// GNU General Public License for more details.
15
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
19
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
24
25/*
26 *
27 * Copyright (c) 1994
28 * Hewlett-Packard Company
29 *
30 * Permission to use, copy, modify, distribute and sell this software
31 * and its documentation for any purpose is hereby granted without fee,
32 * provided that the above copyright notice appear in all copies and
33 * that both that copyright notice and this permission notice appear
34 * in supporting documentation. Hewlett-Packard Company makes no
35 * representations about the suitability of this software for any
36 * purpose. It is provided "as is" without express or implied warranty.
37 *
38 *
39 * Copyright (c) 1996
40 * Silicon Graphics Computer Systems, Inc.
41 *
42 * Permission to use, copy, modify, distribute and sell this software
43 * and its documentation for any purpose is hereby granted without fee,
44 * provided that the above copyright notice appear in all copies and
45 * that both that copyright notice and this permission notice appear
46 * in supporting documentation. Silicon Graphics makes no
47 * representations about the suitability of this software for any
48 * purpose. It is provided "as is" without express or implied warranty.
49 */
50
51/** @file bits/stl_algo.h
52 * This is an internal header file, included by other library headers.
53 * Do not attempt to use it directly. @headername{algorithm}
54 */
55
56#ifndef _STL_ALGO_H
57#define _STL_ALGO_H 1
58
59#include <cstdlib> // for rand
60#include <bits/algorithmfwd.h>
61#include <bits/stl_heap.h>
62#include <bits/stl_tempbuf.h> // for _Temporary_buffer
63#include <bits/predefined_ops.h>
64
65#if __cplusplus >= 201103L
66#include <bits/uniform_int_dist.h>
67#endif
68
69// See concept_check.h for the __glibcxx_*_requires macros.
70
71namespace std _GLIBCXX_VISIBILITY(default)
72{
73_GLIBCXX_BEGIN_NAMESPACE_VERSION
74
75 /// Swaps the median value of *__a, *__b and *__c under __comp to *__result
76 template<typename _Iterator, typename _Compare>
77 void
78 __move_median_to_first(_Iterator __result,_Iterator __a, _Iterator __b,
79 _Iterator __c, _Compare __comp)
80 {
81 if (__comp(__a, __b))
82 {
83 if (__comp(__b, __c))
84 std::iter_swap(__result, __b);
85 else if (__comp(__a, __c))
86 std::iter_swap(__result, __c);
87 else
88 std::iter_swap(__result, __a);
89 }
90 else if (__comp(__a, __c))
91 std::iter_swap(__result, __a);
92 else if (__comp(__b, __c))
93 std::iter_swap(__result, __c);
94 else
95 std::iter_swap(__result, __b);
96 }
97
98 /// This is an overload used by find algos for the Input Iterator case.
99 template<typename _InputIterator, typename _Predicate>
100 inline _InputIterator
101 __find_if(_InputIterator __first, _InputIterator __last,
102 _Predicate __pred, input_iterator_tag)
103 {
104 while (__first != __last && !__pred(__first))
105 ++__first;
106 return __first;
107 }
108
109 /// This is an overload used by find algos for the RAI case.
110 template<typename _RandomAccessIterator, typename _Predicate>
111 _RandomAccessIterator
112 __find_if(_RandomAccessIterator __first, _RandomAccessIterator __last,
113 _Predicate __pred, random_access_iterator_tag)
114 {
115 typename iterator_traits<_RandomAccessIterator>::difference_type
116 __trip_count = (__last - __first) >> 2;
117
118 for (; __trip_count > 0; --__trip_count)
119 {
120 if (__pred(__first))
121 return __first;
122 ++__first;
123
124 if (__pred(__first))
125 return __first;
126 ++__first;
127
128 if (__pred(__first))
129 return __first;
130 ++__first;
131
132 if (__pred(__first))
133 return __first;
134 ++__first;
135 }
136
137 switch (__last - __first)
138 {
139 case 3:
140 if (__pred(__first))
141 return __first;
142 ++__first;
143 case 2:
144 if (__pred(__first))
145 return __first;
146 ++__first;
147 case 1:
148 if (__pred(__first))
149 return __first;
150 ++__first;
151 case 0:
152 default:
153 return __last;
154 }
155 }
156
157 template<typename _Iterator, typename _Predicate>
158 inline _Iterator
159 __find_if(_Iterator __first, _Iterator __last, _Predicate __pred)
160 {
161 return __find_if(__first, __last, __pred,
162 std::__iterator_category(__first));
163 }
164
165 /// Provided for stable_partition to use.
166 template<typename _InputIterator, typename _Predicate>
167 inline _InputIterator
168 __find_if_not(_InputIterator __first, _InputIterator __last,
169 _Predicate __pred)
170 {
171 return std::__find_if(__first, __last,
172 __gnu_cxx::__ops::__negate(__pred),
173 std::__iterator_category(__first));
174 }
175
176 /// Like find_if_not(), but uses and updates a count of the
177 /// remaining range length instead of comparing against an end
178 /// iterator.
179 template<typename _InputIterator, typename _Predicate, typename _Distance>
180 _InputIterator
181 __find_if_not_n(_InputIterator __first, _Distance& __len, _Predicate __pred)
182 {
183 for (; __len; --__len, ++__first)
184 if (!__pred(__first))
185 break;
186 return __first;
187 }
188
189 // set_difference
190 // set_intersection
191 // set_symmetric_difference
192 // set_union
193 // for_each
194 // find
195 // find_if
196 // find_first_of
197 // adjacent_find
198 // count
199 // count_if
200 // search
201
202 template<typename _ForwardIterator1, typename _ForwardIterator2,
203 typename _BinaryPredicate>
204 _ForwardIterator1
205 __search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
206 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
207 _BinaryPredicate __predicate)
208 {
209 // Test for empty ranges
210 if (__first1 == __last1 || __first2 == __last2)
211 return __first1;
212
213 // Test for a pattern of length 1.
214 _ForwardIterator2 __p1(__first2);
215 if (++__p1 == __last2)
216 return std::__find_if(__first1, __last1,
217 __gnu_cxx::__ops::__iter_comp_iter(__predicate, __first2));
218
219 // General case.
220 _ForwardIterator2 __p;
221 _ForwardIterator1 __current = __first1;
222
223 for (;;)
224 {
225 __first1 =
226 std::__find_if(__first1, __last1,
227 __gnu_cxx::__ops::__iter_comp_iter(__predicate, __first2));
228
229 if (__first1 == __last1)
230 return __last1;
231
232 __p = __p1;
233 __current = __first1;
234 if (++__current == __last1)
235 return __last1;
236
237 while (__predicate(__current, __p))
238 {
239 if (++__p == __last2)
240 return __first1;
241 if (++__current == __last1)
242 return __last1;
243 }
244 ++__first1;
245 }
246 return __first1;
247 }
248
249 // search_n
250
251 /**
252 * This is a helper function for search_n overloaded for forward iterators.
253 */
254 template<typename _ForwardIterator, typename _Integer,
255 typename _UnaryPredicate>
256 _ForwardIterator
257 __search_n_aux(_ForwardIterator __first, _ForwardIterator __last,
258 _Integer __count, _UnaryPredicate __unary_pred,
259 std::forward_iterator_tag)
260 {
261 __first = std::__find_if(__first, __last, __unary_pred);
262 while (__first != __last)
263 {
264 typename iterator_traits<_ForwardIterator>::difference_type
265 __n = __count;
266 _ForwardIterator __i = __first;
267 ++__i;
268 while (__i != __last && __n != 1 && __unary_pred(__i))
269 {
270 ++__i;
271 --__n;
272 }
273 if (__n == 1)
274 return __first;
275 if (__i == __last)
276 return __last;
277 __first = std::__find_if(++__i, __last, __unary_pred);
278 }
279 return __last;
280 }
281
282 /**
283 * This is a helper function for search_n overloaded for random access
284 * iterators.
285 */
286 template<typename _RandomAccessIter, typename _Integer,
287 typename _UnaryPredicate>
288 _RandomAccessIter
289 __search_n_aux(_RandomAccessIter __first, _RandomAccessIter __last,
290 _Integer __count, _UnaryPredicate __unary_pred,
291 std::random_access_iterator_tag)
292 {
293 typedef typename std::iterator_traits<_RandomAccessIter>::difference_type
294 _DistanceType;
295
296 _DistanceType __tailSize = __last - __first;
297 _DistanceType __remainder = __count;
298
299 while (__remainder <= __tailSize) // the main loop...
300 {
301 __first += __remainder;
302 __tailSize -= __remainder;
303 // __first here is always pointing to one past the last element of
304 // next possible match.
305 _RandomAccessIter __backTrack = __first;
306 while (__unary_pred(--__backTrack))
307 {
308 if (--__remainder == 0)
309 return (__first - __count); // Success
310 }
311 __remainder = __count + 1 - (__first - __backTrack);
312 }
313 return __last; // Failure
314 }
315
316 template<typename _ForwardIterator, typename _Integer,
317 typename _UnaryPredicate>
318 _ForwardIterator
319 __search_n(_ForwardIterator __first, _ForwardIterator __last,
320 _Integer __count,
321 _UnaryPredicate __unary_pred)
322 {
323 if (__count <= 0)
324 return __first;
325
326 if (__count == 1)
327 return std::__find_if(__first, __last, __unary_pred);
328
329 return std::__search_n_aux(__first, __last, __count, __unary_pred,
330 std::__iterator_category(__first));
331 }
332
333 // find_end for forward iterators.
334 template<typename _ForwardIterator1, typename _ForwardIterator2,
335 typename _BinaryPredicate>
336 _ForwardIterator1
337 __find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
338 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
339 forward_iterator_tag, forward_iterator_tag,
340 _BinaryPredicate __comp)
341 {
342 if (__first2 == __last2)
343 return __last1;
344
345 _ForwardIterator1 __result = __last1;
346 while (1)
347 {
348 _ForwardIterator1 __new_result
349 = std::__search(__first1, __last1, __first2, __last2, __comp);
350 if (__new_result == __last1)
351 return __result;
352 else
353 {
354 __result = __new_result;
355 __first1 = __new_result;
356 ++__first1;
357 }
358 }
359 }
360
361 // find_end for bidirectional iterators (much faster).
362 template<typename _BidirectionalIterator1, typename _BidirectionalIterator2,
363 typename _BinaryPredicate>
364 _BidirectionalIterator1
365 __find_end(_BidirectionalIterator1 __first1,
366 _BidirectionalIterator1 __last1,
367 _BidirectionalIterator2 __first2,
368 _BidirectionalIterator2 __last2,
369 bidirectional_iterator_tag, bidirectional_iterator_tag,
370 _BinaryPredicate __comp)
371 {
372 // concept requirements
373 __glibcxx_function_requires(_BidirectionalIteratorConcept<
374 _BidirectionalIterator1>)
375 __glibcxx_function_requires(_BidirectionalIteratorConcept<
376 _BidirectionalIterator2>)
377
378 typedef reverse_iterator<_BidirectionalIterator1> _RevIterator1;
379 typedef reverse_iterator<_BidirectionalIterator2> _RevIterator2;
380
381 _RevIterator1 __rlast1(__first1);
382 _RevIterator2 __rlast2(__first2);
383 _RevIterator1 __rresult = std::__search(_RevIterator1(__last1), __rlast1,
384 _RevIterator2(__last2), __rlast2,
385 __comp);
386
387 if (__rresult == __rlast1)
388 return __last1;
389 else
390 {
391 _BidirectionalIterator1 __result = __rresult.base();
392 std::advance(__result, -std::distance(__first2, __last2));
393 return __result;
394 }
395 }
396
397 /**
398 * @brief Find last matching subsequence in a sequence.
399 * @ingroup non_mutating_algorithms
400 * @param __first1 Start of range to search.
401 * @param __last1 End of range to search.
402 * @param __first2 Start of sequence to match.
403 * @param __last2 End of sequence to match.
404 * @return The last iterator @c i in the range
405 * @p [__first1,__last1-(__last2-__first2)) such that @c *(i+N) ==
406 * @p *(__first2+N) for each @c N in the range @p
407 * [0,__last2-__first2), or @p __last1 if no such iterator exists.
408 *
409 * Searches the range @p [__first1,__last1) for a sub-sequence that
410 * compares equal value-by-value with the sequence given by @p
411 * [__first2,__last2) and returns an iterator to the __first
412 * element of the sub-sequence, or @p __last1 if the sub-sequence
413 * is not found. The sub-sequence will be the last such
414 * subsequence contained in [__first1,__last1).
415 *
416 * Because the sub-sequence must lie completely within the range @p
417 * [__first1,__last1) it must start at a position less than @p
418 * __last1-(__last2-__first2) where @p __last2-__first2 is the
419 * length of the sub-sequence. This means that the returned
420 * iterator @c i will be in the range @p
421 * [__first1,__last1-(__last2-__first2))
422 */
423 template<typename _ForwardIterator1, typename _ForwardIterator2>
424 inline _ForwardIterator1
425 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
426 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
427 {
428 // concept requirements
429 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
430 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
431 __glibcxx_function_requires(_EqualOpConcept<
432 typename iterator_traits<_ForwardIterator1>::value_type,
433 typename iterator_traits<_ForwardIterator2>::value_type>)
434 __glibcxx_requires_valid_range(__first1, __last1);
435 __glibcxx_requires_valid_range(__first2, __last2);
436
437 return std::__find_end(__first1, __last1, __first2, __last2,
438 std::__iterator_category(__first1),
439 std::__iterator_category(__first2),
440 __gnu_cxx::__ops::__iter_equal_to_iter());
441 }
442
443 /**
444 * @brief Find last matching subsequence in a sequence using a predicate.
445 * @ingroup non_mutating_algorithms
446 * @param __first1 Start of range to search.
447 * @param __last1 End of range to search.
448 * @param __first2 Start of sequence to match.
449 * @param __last2 End of sequence to match.
450 * @param __comp The predicate to use.
451 * @return The last iterator @c i in the range @p
452 * [__first1,__last1-(__last2-__first2)) such that @c
453 * predicate(*(i+N), @p (__first2+N)) is true for each @c N in the
454 * range @p [0,__last2-__first2), or @p __last1 if no such iterator
455 * exists.
456 *
457 * Searches the range @p [__first1,__last1) for a sub-sequence that
458 * compares equal value-by-value with the sequence given by @p
459 * [__first2,__last2) using comp as a predicate and returns an
460 * iterator to the first element of the sub-sequence, or @p __last1
461 * if the sub-sequence is not found. The sub-sequence will be the
462 * last such subsequence contained in [__first1,__last1).
463 *
464 * Because the sub-sequence must lie completely within the range @p
465 * [__first1,__last1) it must start at a position less than @p
466 * __last1-(__last2-__first2) where @p __last2-__first2 is the
467 * length of the sub-sequence. This means that the returned
468 * iterator @c i will be in the range @p
469 * [__first1,__last1-(__last2-__first2))
470 */
471 template<typename _ForwardIterator1, typename _ForwardIterator2,
472 typename _BinaryPredicate>
473 inline _ForwardIterator1
474 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
475 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
476 _BinaryPredicate __comp)
477 {
478 // concept requirements
479 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
480 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
481 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
482 typename iterator_traits<_ForwardIterator1>::value_type,
483 typename iterator_traits<_ForwardIterator2>::value_type>)
484 __glibcxx_requires_valid_range(__first1, __last1);
485 __glibcxx_requires_valid_range(__first2, __last2);
486
487 return std::__find_end(__first1, __last1, __first2, __last2,
488 std::__iterator_category(__first1),
489 std::__iterator_category(__first2),
490 __gnu_cxx::__ops::__iter_comp_iter(__comp));
491 }
492
493#if __cplusplus >= 201103L
494 /**
495 * @brief Checks that a predicate is true for all the elements
496 * of a sequence.
497 * @ingroup non_mutating_algorithms
498 * @param __first An input iterator.
499 * @param __last An input iterator.
500 * @param __pred A predicate.
501 * @return True if the check is true, false otherwise.
502 *
503 * Returns true if @p __pred is true for each element in the range
504 * @p [__first,__last), and false otherwise.
505 */
506 template<typename _InputIterator, typename _Predicate>
507 inline bool
508 all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
509 { return __last == std::find_if_not(__first, __last, __pred); }
510
511 /**
512 * @brief Checks that a predicate is false for all the elements
513 * of a sequence.
514 * @ingroup non_mutating_algorithms
515 * @param __first An input iterator.
516 * @param __last An input iterator.
517 * @param __pred A predicate.
518 * @return True if the check is true, false otherwise.
519 *
520 * Returns true if @p __pred is false for each element in the range
521 * @p [__first,__last), and false otherwise.
522 */
523 template<typename _InputIterator, typename _Predicate>
524 inline bool
525 none_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
526 { return __last == _GLIBCXX_STD_A::find_if(__first, __last, __pred); }
36
Returning the value 1, which participates in a condition later
527
528 /**
529 * @brief Checks that a predicate is false for at least an element
530 * of a sequence.
531 * @ingroup non_mutating_algorithms
532 * @param __first An input iterator.
533 * @param __last An input iterator.
534 * @param __pred A predicate.
535 * @return True if the check is true, false otherwise.
536 *
537 * Returns true if an element exists in the range @p
538 * [__first,__last) such that @p __pred is true, and false
539 * otherwise.
540 */
541 template<typename _InputIterator, typename _Predicate>
542 inline bool
543 any_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
544 { return !std::none_of(__first, __last, __pred); }
35
Calling 'none_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp:34578:30)>'
37
Returning from 'none_of<llvm::SDValue *, (lambda at /build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/lib/Target/X86/X86ISelLowering.cpp:34578:30)>'
38
Returning zero, which participates in a condition later
545
546 /**
547 * @brief Find the first element in a sequence for which a
548 * predicate is false.
549 * @ingroup non_mutating_algorithms
550 * @param __first An input iterator.
551 * @param __last An input iterator.
552 * @param __pred A predicate.
553 * @return The first iterator @c i in the range @p [__first,__last)
554 * such that @p __pred(*i) is false, or @p __last if no such iterator exists.
555 */
556 template<typename _InputIterator, typename _Predicate>
557 inline _InputIterator
558 find_if_not(_InputIterator __first, _InputIterator __last,
559 _Predicate __pred)
560 {
561 // concept requirements
562 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
563 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
564 typename iterator_traits<_InputIterator>::value_type>)
565 __glibcxx_requires_valid_range(__first, __last);
566 return std::__find_if_not(__first, __last,
567 __gnu_cxx::__ops::__pred_iter(__pred));
568 }
569
570 /**
571 * @brief Checks whether the sequence is partitioned.
572 * @ingroup mutating_algorithms
573 * @param __first An input iterator.
574 * @param __last An input iterator.
575 * @param __pred A predicate.
576 * @return True if the range @p [__first,__last) is partitioned by @p __pred,
577 * i.e. if all elements that satisfy @p __pred appear before those that
578 * do not.
579 */
580 template<typename _InputIterator, typename _Predicate>
581 inline bool
582 is_partitioned(_InputIterator __first, _InputIterator __last,
583 _Predicate __pred)
584 {
585 __first = std::find_if_not(__first, __last, __pred);
586 return std::none_of(__first, __last, __pred);
587 }
588
589 /**
590 * @brief Find the partition point of a partitioned range.
591 * @ingroup mutating_algorithms
592 * @param __first An iterator.
593 * @param __last Another iterator.
594 * @param __pred A predicate.
595 * @return An iterator @p mid such that @p all_of(__first, mid, __pred)
596 * and @p none_of(mid, __last, __pred) are both true.
597 */
598 template<typename _ForwardIterator, typename _Predicate>
599 _ForwardIterator
600 partition_point(_ForwardIterator __first, _ForwardIterator __last,
601 _Predicate __pred)
602 {
603 // concept requirements
604 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
605 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
606 typename iterator_traits<_ForwardIterator>::value_type>)
607
608 // A specific debug-mode test will be necessary...
609 __glibcxx_requires_valid_range(__first, __last);
610
611 typedef typename iterator_traits<_ForwardIterator>::difference_type
612 _DistanceType;
613
614 _DistanceType __len = std::distance(__first, __last);
615 _DistanceType __half;
616 _ForwardIterator __middle;
617
618 while (__len > 0)
619 {
620 __half = __len >> 1;
621 __middle = __first;
622 std::advance(__middle, __half);
623 if (__pred(*__middle))
624 {
625 __first = __middle;
626 ++__first;
627 __len = __len - __half - 1;
628 }
629 else
630 __len = __half;
631 }
632 return __first;
633 }
634#endif
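partition_point above binary-searches a range already partitioned by the predicate. A hedged usage sketch (illustrative only, hypothetical data, not part of the analyzed source):

    // Illustrative only: partition_point on a range already partitioned by the predicate.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{2, 4, 6, 1, 3, 5};            // evens first, then odds
      auto is_even = [](int x) { return x % 2 == 0; };
      auto mid = std::partition_point(v.begin(), v.end(), is_even);
      assert(mid - v.begin() == 3);                    // first element for which the predicate is false
    }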
635
636 template<typename _InputIterator, typename _OutputIterator,
637 typename _Predicate>
638 _OutputIterator
639 __remove_copy_if(_InputIterator __first, _InputIterator __last,
640 _OutputIterator __result, _Predicate __pred)
641 {
642 for (; __first != __last; ++__first)
643 if (!__pred(__first))
644 {
645 *__result = *__first;
646 ++__result;
647 }
648 return __result;
649 }
650
651 /**
652 * @brief Copy a sequence, removing elements of a given value.
653 * @ingroup mutating_algorithms
654 * @param __first An input iterator.
655 * @param __last An input iterator.
656 * @param __result An output iterator.
657 * @param __value The value to be removed.
658 * @return An iterator designating the end of the resulting sequence.
659 *
660 * Copies each element in the range @p [__first,__last) not equal
661 * to @p __value to the range beginning at @p __result.
662 * remove_copy() is stable, so the relative order of elements that
663 * are copied is unchanged.
664 */
665 template<typename _InputIterator, typename _OutputIterator, typename _Tp>
666 inline _OutputIterator
667 remove_copy(_InputIterator __first, _InputIterator __last,
668 _OutputIterator __result, const _Tp& __value)
669 {
670 // concept requirements
671 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
672 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
673 typename iterator_traits<_InputIterator>::value_type>)
674 __glibcxx_function_requires(_EqualOpConcept<
675 typename iterator_traits<_InputIterator>::value_type, _Tp>)
676 __glibcxx_requires_valid_range(__first, __last);
677
678 return std::__remove_copy_if(__first, __last, __result,
679 __gnu_cxx::__ops::__iter_equals_val(__value));
680 }
681
682 /**
683 * @brief Copy a sequence, removing elements for which a predicate is true.
684 * @ingroup mutating_algorithms
685 * @param __first An input iterator.
686 * @param __last An input iterator.
687 * @param __result An output iterator.
688 * @param __pred A predicate.
689 * @return An iterator designating the end of the resulting sequence.
690 *
691 * Copies each element in the range @p [__first,__last) for which
692 * @p __pred returns false to the range beginning at @p __result.
693 *
694 * remove_copy_if() is stable, so the relative order of elements that are
695 * copied is unchanged.
696 */
697 template<typename _InputIterator, typename _OutputIterator,
698 typename _Predicate>
699 inline _OutputIterator
700 remove_copy_if(_InputIterator __first, _InputIterator __last,
701 _OutputIterator __result, _Predicate __pred)
702 {
703 // concept requirements
704 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
705 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
706 typename iterator_traits<_InputIterator>::value_type>)
707 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
708 typename iterator_traits<_InputIterator>::value_type>)
709 __glibcxx_requires_valid_range(__first, __last);
710
711 return std::__remove_copy_if(__first, __last, __result,
712 __gnu_cxx::__ops::__pred_iter(__pred));
713 }
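A short usage sketch for remove_copy_if (illustrative only; the container and predicate are hypothetical): the source range is left untouched and only the elements the predicate rejects are copied out.

    // Illustrative only: remove_copy_if copies the elements the predicate rejects.
    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> src{1, 2, 3, 4, 5, 6};
      std::vector<int> dst;
      std::remove_copy_if(src.begin(), src.end(), std::back_inserter(dst),
                          [](int x) { return x % 2 == 0; });
      for (int x : dst) std::cout << x << ' ';         // prints 1 3 5; src is unchanged
    }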
714
715#if __cplusplus >= 201103L
716 /**
717 * @brief Copy the elements of a sequence for which a predicate is true.
718 * @ingroup mutating_algorithms
719 * @param __first An input iterator.
720 * @param __last An input iterator.
721 * @param __result An output iterator.
722 * @param __pred A predicate.
723 * @return An iterator designating the end of the resulting sequence.
724 *
725 * Copies each element in the range @p [__first,__last) for which
726 * @p __pred returns true to the range beginning at @p __result.
727 *
728 * copy_if() is stable, so the relative order of elements that are
729 * copied is unchanged.
730 */
731 template<typename _InputIterator, typename _OutputIterator,
732 typename _Predicate>
733 _OutputIterator
734 copy_if(_InputIterator __first, _InputIterator __last,
735 _OutputIterator __result, _Predicate __pred)
736 {
737 // concept requirements
738 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
739 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
740 typename iterator_traits<_InputIterator>::value_type>)
741 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
742 typename iterator_traits<_InputIterator>::value_type>)
743 __glibcxx_requires_valid_range(__first, __last);
744
745 for (; __first != __last; ++__first)
746 if (__pred(*__first))
747 {
748 *__result = *__first;
749 ++__result;
750 }
751 return __result;
752 }
753
754 template<typename _InputIterator, typename _Size, typename _OutputIterator>
755 _OutputIterator
756 __copy_n(_InputIterator __first, _Size __n,
757 _OutputIterator __result, input_iterator_tag)
758 {
759 if (__n > 0)
760 {
761 while (true)
762 {
763 *__result = *__first;
764 ++__result;
765 if (--__n > 0)
766 ++__first;
767 else
768 break;
769 }
770 }
771 return __result;
772 }
773
774 template<typename _RandomAccessIterator, typename _Size,
775 typename _OutputIterator>
776 inline _OutputIterator
777 __copy_n(_RandomAccessIterator __first, _Size __n,
778 _OutputIterator __result, random_access_iterator_tag)
779 { return std::copy(__first, __first + __n, __result); }
780
781 /**
782 * @brief Copies the range [first,first+n) into [result,result+n).
783 * @ingroup mutating_algorithms
784 * @param __first An input iterator.
785 * @param __n The number of elements to copy.
786 * @param __result An output iterator.
787 * @return result+n.
788 *
789 * This inline function will boil down to a call to @c memmove whenever
790 * possible. Failing that, if random access iterators are passed, then the
791 * loop count will be known (and therefore a candidate for compiler
792 * optimizations such as unrolling).
793 */
794 template<typename _InputIterator, typename _Size, typename _OutputIterator>
795 inline _OutputIterator
796 copy_n(_InputIterator __first, _Size __n, _OutputIterator __result)
797 {
798 // concept requirements
799 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
800 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
801 typename iterator_traits<_InputIterator>::value_type>)
802
803 return std::__copy_n(__first, __n, __result,
804 std::__iterator_category(__first));
805 }
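A minimal copy_n sketch (illustrative only, hypothetical values), showing that exactly __n elements are copied and the destination must already have room for them:

    // Illustrative only: copy_n copies exactly n elements starting at the first iterator.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> src{10, 20, 30, 40};
      std::vector<int> dst(3);                         // destination sized up front
      std::copy_n(src.begin(), 3, dst.begin());
      assert((dst == std::vector<int>{10, 20, 30}));
    }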
806
807 /**
808 * @brief Copy the elements of a sequence to separate output sequences
809 * depending on the truth value of a predicate.
810 * @ingroup mutating_algorithms
811 * @param __first An input iterator.
812 * @param __last An input iterator.
813 * @param __out_true An output iterator.
814 * @param __out_false An output iterator.
815 * @param __pred A predicate.
816 * @return A pair designating the ends of the resulting sequences.
817 *
818 * Copies each element in the range @p [__first,__last) for which
819 * @p __pred returns true to the range beginning at @p out_true
820 * and each element for which @p __pred returns false to @p __out_false.
821 */
822 template<typename _InputIterator, typename _OutputIterator1,
823 typename _OutputIterator2, typename _Predicate>
824 pair<_OutputIterator1, _OutputIterator2>
825 partition_copy(_InputIterator __first, _InputIterator __last,
826 _OutputIterator1 __out_true, _OutputIterator2 __out_false,
827 _Predicate __pred)
828 {
829 // concept requirements
830 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
831 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator1,
832 typename iterator_traits<_InputIterator>::value_type>)
833 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator2,
834 typename iterator_traits<_InputIterator>::value_type>)
835 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
836 typename iterator_traits<_InputIterator>::value_type>)
837 __glibcxx_requires_valid_range(__first, __last);
838
839 for (; __first != __last; ++__first)
840 if (__pred(*__first))
841 {
842 *__out_true = *__first;
843 ++__out_true;
844 }
845 else
846 {
847 *__out_false = *__first;
848 ++__out_false;
849 }
850
851 return pair<_OutputIterator1, _OutputIterator2>(__out_true, __out_false);
852 }
853#endif
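A hedged sketch of partition_copy (illustrative only; names and data are hypothetical), splitting one input range into two output sequences according to the predicate:

    // Illustrative only: partition_copy routes elements to one of two outputs.
    #include <algorithm>
    #include <iterator>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5};
      std::vector<int> evens, odds;
      std::partition_copy(v.begin(), v.end(),
                          std::back_inserter(evens), std::back_inserter(odds),
                          [](int x) { return x % 2 == 0; });
      // evens holds 2 4, odds holds 1 3 5
    }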
854
855 template<typename _ForwardIterator, typename _Predicate>
856 _ForwardIterator
857 __remove_if(_ForwardIterator __first, _ForwardIterator __last,
858 _Predicate __pred)
859 {
860 __first = std::__find_if(__first, __last, __pred);
861 if (__first == __last)
862 return __first;
863 _ForwardIterator __result = __first;
864 ++__first;
865 for (; __first != __last; ++__first)
866 if (!__pred(__first))
867 {
868 *__result = _GLIBCXX_MOVE(*__first);
869 ++__result;
870 }
871 return __result;
872 }
873
874 /**
875 * @brief Remove elements from a sequence.
876 * @ingroup mutating_algorithms
877 * @param __first An input iterator.
878 * @param __last An input iterator.
879 * @param __value The value to be removed.
880 * @return An iterator designating the end of the resulting sequence.
881 *
882 * All elements equal to @p __value are removed from the range
883 * @p [__first,__last).
884 *
885 * remove() is stable, so the relative order of elements that are
886 * not removed is unchanged.
887 *
888 * Elements between the end of the resulting sequence and @p __last
889 * are still present, but their value is unspecified.
890 */
891 template<typename _ForwardIterator, typename _Tp>
892 inline _ForwardIterator
893 remove(_ForwardIterator __first, _ForwardIterator __last,
894 const _Tp& __value)
895 {
896 // concept requirements
897 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
898 _ForwardIterator>)
899 __glibcxx_function_requires(_EqualOpConcept<
900 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
901 __glibcxx_requires_valid_range(__first, __last);
902
903 return std::__remove_if(__first, __last,
904 __gnu_cxx::__ops::__iter_equals_val(__value));
905 }
906
907 /**
908 * @brief Remove elements from a sequence using a predicate.
909 * @ingroup mutating_algorithms
910 * @param __first A forward iterator.
911 * @param __last A forward iterator.
912 * @param __pred A predicate.
913 * @return An iterator designating the end of the resulting sequence.
914 *
915 * All elements for which @p __pred returns true are removed from the range
916 * @p [__first,__last).
917 *
918 * remove_if() is stable, so the relative order of elements that are
919 * not removed is unchanged.
920 *
921 * Elements between the end of the resulting sequence and @p __last
922 * are still present, but their value is unspecified.
923 */
924 template<typename _ForwardIterator, typename _Predicate>
925 inline _ForwardIterator
926 remove_if(_ForwardIterator __first, _ForwardIterator __last,
927 _Predicate __pred)
928 {
929 // concept requirements
930 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
931 _ForwardIterator>)
932 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
933 typename iterator_traits<_ForwardIterator>::value_type>)
934 __glibcxx_requires_valid_range(__first, __last);
935
936 return std::__remove_if(__first, __last,
937 __gnu_cxx::__ops::__pred_iter(__pred));
938 }
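Because remove_if only compacts the kept elements and leaves the tail in an unspecified state, it is typically paired with erase. A minimal sketch of that idiom (illustrative only, hypothetical data):

    // Illustrative only: the erase-remove idiom built on remove_if.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5, 6};
      auto new_end = std::remove_if(v.begin(), v.end(),
                                    [](int x) { return x % 2 == 0; });
      v.erase(new_end, v.end());                       // discard the unspecified tail
      assert((v == std::vector<int>{1, 3, 5}));
    }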
939
940 template<typename _ForwardIterator, typename _BinaryPredicate>
941 _ForwardIterator
942 __adjacent_find(_ForwardIterator __first, _ForwardIterator __last,
943 _BinaryPredicate __binary_pred)
944 {
945 if (__first == __last)
946 return __last;
947 _ForwardIterator __next = __first;
948 while (++__next != __last)
949 {
950 if (__binary_pred(__first, __next))
951 return __first;
952 __first = __next;
953 }
954 return __last;
955 }
956
957 template<typename _ForwardIterator, typename _BinaryPredicate>
958 _ForwardIterator
959 __unique(_ForwardIterator __first, _ForwardIterator __last,
960 _BinaryPredicate __binary_pred)
961 {
962 // Skip the beginning, if already unique.
963 __first = std::__adjacent_find(__first, __last, __binary_pred);
964 if (__first == __last)
965 return __last;
966
967 // Do the real copy work.
968 _ForwardIterator __dest = __first;
969 ++__first;
970 while (++__first != __last)
971 if (!__binary_pred(__dest, __first))
972 *++__dest = _GLIBCXX_MOVE(*__first);
973 return ++__dest;
974 }
975
976 /**
977 * @brief Remove consecutive duplicate values from a sequence.
978 * @ingroup mutating_algorithms
979 * @param __first A forward iterator.
980 * @param __last A forward iterator.
981 * @return An iterator designating the end of the resulting sequence.
982 *
983 * Removes all but the first element from each group of consecutive
984 * values that compare equal.
985 * unique() is stable, so the relative order of elements that are
986 * not removed is unchanged.
987 * Elements between the end of the resulting sequence and @p __last
988 * are still present, but their value is unspecified.
989 */
990 template<typename _ForwardIterator>
991 inline _ForwardIterator
992 unique(_ForwardIterator __first, _ForwardIterator __last)
993 {
994 // concept requirements
995 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
996 _ForwardIterator>)
997 __glibcxx_function_requires(_EqualityComparableConcept<
998 typename iterator_traits<_ForwardIterator>::value_type>)
999 __glibcxx_requires_valid_range(__first, __last);
1000
1001 return std::__unique(__first, __last,
1002 __gnu_cxx::__ops::__iter_equal_to_iter());
1003 }
1004
1005 /**
1006 * @brief Remove consecutive values from a sequence using a predicate.
1007 * @ingroup mutating_algorithms
1008 * @param __first A forward iterator.
1009 * @param __last A forward iterator.
1010 * @param __binary_pred A binary predicate.
1011 * @return An iterator designating the end of the resulting sequence.
1012 *
1013 * Removes all but the first element from each group of consecutive
1014 * values for which @p __binary_pred returns true.
1015 * unique() is stable, so the relative order of elements that are
1016 * not removed is unchanged.
1017 * Elements between the end of the resulting sequence and @p __last
1018 * are still present, but their value is unspecified.
1019 */
1020 template<typename _ForwardIterator, typename _BinaryPredicate>
1021 inline _ForwardIterator
1022 unique(_ForwardIterator __first, _ForwardIterator __last,
1023 _BinaryPredicate __binary_pred)
1024 {
1025 // concept requirements
1026 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
1027 _ForwardIterator>)
1028 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1029 typename iterator_traits<_ForwardIterator>::value_type,
1030 typename iterator_traits<_ForwardIterator>::value_type>)
1031 __glibcxx_requires_valid_range(__first, __last);
1032
1033 return std::__unique(__first, __last,
1034 __gnu_cxx::__ops::__iter_comp_iter(__binary_pred));
1035 }
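A usage sketch for unique (illustrative only, hypothetical data): only consecutive duplicates collapse, and, as with remove_if, erase is needed to actually shrink the container.

    // Illustrative only: unique removes consecutive duplicates; erase shrinks the container.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 1, 2, 2, 2, 3, 1};
      v.erase(std::unique(v.begin(), v.end()), v.end());
      assert((v == std::vector<int>{1, 2, 3, 1}));     // only consecutive runs collapse
    }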
1036
1037 /**
1038 * This is an uglified
1039 * unique_copy(_InputIterator, _InputIterator, _OutputIterator,
1040 * _BinaryPredicate)
1041 * overloaded for forward iterators and output iterator as result.
1042 */
1043 template<typename _ForwardIterator, typename _OutputIterator,
1044 typename _BinaryPredicate>
1045 _OutputIterator
1046 __unique_copy(_ForwardIterator __first, _ForwardIterator __last,
1047 _OutputIterator __result, _BinaryPredicate __binary_pred,
1048 forward_iterator_tag, output_iterator_tag)
1049 {
1050 // concept requirements -- iterators already checked
1051 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1052 typename iterator_traits<_ForwardIterator>::value_type,
1053 typename iterator_traits<_ForwardIterator>::value_type>)
1054
1055 _ForwardIterator __next = __first;
1056 *__result = *__first;
1057 while (++__next != __last)
1058 if (!__binary_pred(__first, __next))
1059 {
1060 __first = __next;
1061 *++__result = *__first;
1062 }
1063 return ++__result;
1064 }
1065
1066 /**
1067 * This is an uglified
1068 * unique_copy(_InputIterator, _InputIterator, _OutputIterator,
1069 * _BinaryPredicate)
1070 * overloaded for input iterators and output iterator as result.
1071 */
1072 template<typename _InputIterator, typename _OutputIterator,
1073 typename _BinaryPredicate>
1074 _OutputIterator
1075 __unique_copy(_InputIterator __first, _InputIterator __last,
1076 _OutputIterator __result, _BinaryPredicate __binary_pred,
1077 input_iterator_tag, output_iterator_tag)
1078 {
1079 // concept requirements -- iterators already checked
1080 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1081 typename iterator_traits<_InputIterator>::value_type,
1082 typename iterator_traits<_InputIterator>::value_type>)
1083
1084 typename iterator_traits<_InputIterator>::value_type __value = *__first;
1085 __decltype(__gnu_cxx::__ops::__iter_comp_val(__binary_pred))
1086 __rebound_pred
1087 = __gnu_cxx::__ops::__iter_comp_val(__binary_pred);
1088 *__result = __value;
1089 while (++__first != __last)
1090 if (!__rebound_pred(__first, __value))
1091 {
1092 __value = *__first;
1093 *++__result = __value;
1094 }
1095 return ++__result;
1096 }
1097
1098 /**
1099 * This is an uglified
1100 * unique_copy(_InputIterator, _InputIterator, _OutputIterator,
1101 * _BinaryPredicate)
1102 * overloaded for input iterators and forward iterator as result.
1103 */
1104 template<typename _InputIterator, typename _ForwardIterator,
1105 typename _BinaryPredicate>
1106 _ForwardIterator
1107 __unique_copy(_InputIterator __first, _InputIterator __last,
1108 _ForwardIterator __result, _BinaryPredicate __binary_pred,
1109 input_iterator_tag, forward_iterator_tag)
1110 {
1111 // concept requirements -- iterators already checked
1112 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
1113 typename iterator_traits<_ForwardIterator>::value_type,
1114 typename iterator_traits<_InputIterator>::value_type>)
1115 *__result = *__first;
1116 while (++__first != __last)
1117 if (!__binary_pred(__result, __first))
1118 *++__result = *__first;
1119 return ++__result;
1120 }
1121
1122 /**
1123 * This is an uglified reverse(_BidirectionalIterator,
1124 * _BidirectionalIterator)
1125 * overloaded for bidirectional iterators.
1126 */
1127 template<typename _BidirectionalIterator>
1128 void
1129 __reverse(_BidirectionalIterator __first, _BidirectionalIterator __last,
1130 bidirectional_iterator_tag)
1131 {
1132 while (true)
1133 if (__first == __last || __first == --__last)
1134 return;
1135 else
1136 {
1137 std::iter_swap(__first, __last);
1138 ++__first;
1139 }
1140 }
1141
1142 /**
1143 * This is an uglified reverse(_BidirectionalIterator,
1144 * _BidirectionalIterator)
1145 * overloaded for random access iterators.
1146 */
1147 template<typename _RandomAccessIterator>
1148 void
1149 __reverse(_RandomAccessIterator __first, _RandomAccessIterator __last,
1150 random_access_iterator_tag)
1151 {
1152 if (__first == __last)
1153 return;
1154 --__last;
1155 while (__first < __last)
1156 {
1157 std::iter_swap(__first, __last);
1158 ++__first;
1159 --__last;
1160 }
1161 }
1162
1163 /**
1164 * @brief Reverse a sequence.
1165 * @ingroup mutating_algorithms
1166 * @param __first A bidirectional iterator.
1167 * @param __last A bidirectional iterator.
1168 * @return reverse() returns no value.
1169 *
1170 * Reverses the order of the elements in the range @p [__first,__last),
1171 * so that the first element becomes the last etc.
1172 * For every @c i such that @p 0<=i<=(__last-__first)/2), @p reverse()
1173 * swaps @p *(__first+i) and @p *(__last-(i+1))
1174 */
1175 template<typename _BidirectionalIterator>
1176 inline void
1177 reverse(_BidirectionalIterator __first, _BidirectionalIterator __last)
1178 {
1179 // concept requirements
1180 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
1181 _BidirectionalIterator>)
1182 __glibcxx_requires_valid_range(__first, __last);
1183 std::__reverse(__first, __last, std::__iterator_category(__first));
1184 }
1185
1186 /**
1187 * @brief Copy a sequence, reversing its elements.
1188 * @ingroup mutating_algorithms
1189 * @param __first A bidirectional iterator.
1190 * @param __last A bidirectional iterator.
1191 * @param __result An output iterator.
1192 * @return An iterator designating the end of the resulting sequence.
1193 *
1194 * Copies the elements in the range @p [__first,__last) to the
1195 * range @p [__result,__result+(__last-__first)) such that the
1196 * order of the elements is reversed. For every @c i such that @p
1197 * 0<=i<=(__last-__first), @p reverse_copy() performs the
1198 * assignment @p *(__result+(__last-__first)-1-i) = *(__first+i).
1199 * The ranges @p [__first,__last) and @p
1200 * [__result,__result+(__last-__first)) must not overlap.
1201 */
1202 template<typename _BidirectionalIterator, typename _OutputIterator>
1203 _OutputIterator
1204 reverse_copy(_BidirectionalIterator __first, _BidirectionalIterator __last,
1205 _OutputIterator __result)
1206 {
1207 // concept requirements
1208 __glibcxx_function_requires(_BidirectionalIteratorConcept<
1209 _BidirectionalIterator>)
1210 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
1211 typename iterator_traits<_BidirectionalIterator>::value_type>)
1212 __glibcxx_requires_valid_range(__first, __last);
1213
1214 while (__first != __last)
1215 {
1216 --__last;
1217 *__result = *__last;
1218 ++__result;
1219 }
1220 return __result;
1221 }
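A brief sketch contrasting reverse and reverse_copy (illustrative only, hypothetical data): the former mutates in place, the latter writes to a non-overlapping destination.

    // Illustrative only: reverse mutates in place, reverse_copy writes elsewhere.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4};
      std::vector<int> r(v.size());
      std::reverse_copy(v.begin(), v.end(), r.begin());
      assert((r == std::vector<int>{4, 3, 2, 1}));
      std::reverse(v.begin(), v.end());
      assert(v == r);
    }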
1222
1223 /**
1224 * This is a helper function for the rotate algorithm specialized on RAIs.
1225 * It returns the greatest common divisor of two integer values.
1226 */
1227 template<typename _EuclideanRingElement>
1228 _EuclideanRingElement
1229 __gcd(_EuclideanRingElement __m, _EuclideanRingElement __n)
1230 {
1231 while (__n != 0)
1232 {
1233 _EuclideanRingElement __t = __m % __n;
1234 __m = __n;
1235 __n = __t;
1236 }
1237 return __m;
1238 }
1239
1240 inline namespace _V2
1241 {
1242
1243 /// This is a helper function for the rotate algorithm.
1244 template<typename _ForwardIterator>
1245 _ForwardIterator
1246 __rotate(_ForwardIterator __first,
1247 _ForwardIterator __middle,
1248 _ForwardIterator __last,
1249 forward_iterator_tag)
1250 {
1251 if (__first == __middle)
1252 return __last;
1253 else if (__last == __middle)
1254 return __first;
1255
1256 _ForwardIterator __first2 = __middle;
1257 do
1258 {
1259 std::iter_swap(__first, __first2);
1260 ++__first;
1261 ++__first2;
1262 if (__first == __middle)
1263 __middle = __first2;
1264 }
1265 while (__first2 != __last);
1266
1267 _ForwardIterator __ret = __first;
1268
1269 __first2 = __middle;
1270
1271 while (__first2 != __last)
1272 {
1273 std::iter_swap(__first, __first2);
1274 ++__first;
1275 ++__first2;
1276 if (__first == __middle)
1277 __middle = __first2;
1278 else if (__first2 == __last)
1279 __first2 = __middle;
1280 }
1281 return __ret;
1282 }
1283
1284 /// This is a helper function for the rotate algorithm.
1285 template<typename _BidirectionalIterator>
1286 _BidirectionalIterator
1287 __rotate(_BidirectionalIterator __first,
1288 _BidirectionalIterator __middle,
1289 _BidirectionalIterator __last,
1290 bidirectional_iterator_tag)
1291 {
1292 // concept requirements
1293 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
1294 _BidirectionalIterator>)
1295
1296 if (__first == __middle)
1297 return __last;
1298 else if (__last == __middle)
1299 return __first;
1300
1301 std::__reverse(__first, __middle, bidirectional_iterator_tag());
1302 std::__reverse(__middle, __last, bidirectional_iterator_tag());
1303
1304 while (__first != __middle && __middle != __last)
1305 {
1306 std::iter_swap(__first, --__last);
1307 ++__first;
1308 }
1309
1310 if (__first == __middle)
1311 {
1312 std::__reverse(__middle, __last, bidirectional_iterator_tag());
1313 return __last;
1314 }
1315 else
1316 {
1317 std::__reverse(__first, __middle, bidirectional_iterator_tag());
1318 return __first;
1319 }
1320 }
1321
1322 /// This is a helper function for the rotate algorithm.
1323 template<typename _RandomAccessIterator>
1324 _RandomAccessIterator
1325 __rotate(_RandomAccessIterator __first,
1326 _RandomAccessIterator __middle,
1327 _RandomAccessIterator __last,
1328 random_access_iterator_tag)
1329 {
1330 // concept requirements
1331 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
1332 _RandomAccessIterator>)
1333
1334 if (__first == __middle)
1335 return __last;
1336 else if (__last == __middle)
1337 return __first;
1338
1339 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
1340 _Distance;
1341 typedef typename iterator_traits<_RandomAccessIterator>::value_type
1342 _ValueType;
1343
1344 _Distance __n = __last - __first;
1345 _Distance __k = __middle - __first;
1346
1347 if (__k == __n - __k)
1348 {
1349 std::swap_ranges(__first, __middle, __middle);
1350 return __middle;
1351 }
1352
1353 _RandomAccessIterator __p = __first;
1354 _RandomAccessIterator __ret = __first + (__last - __middle);
1355
1356 for (;;)
1357 {
1358 if (__k < __n - __k)
1359 {
1360 if (__is_pod(_ValueType) && __k == 1)
1361 {
1362 _ValueType __t = _GLIBCXX_MOVE(*__p);
1363 _GLIBCXX_MOVE3(__p + 1, __p + __n, __p);
1364 *(__p + __n - 1) = _GLIBCXX_MOVE(__t);
1365 return __ret;
1366 }
1367 _RandomAccessIterator __q = __p + __k;
1368 for (_Distance __i = 0; __i < __n - __k; ++ __i)
1369 {
1370 std::iter_swap(__p, __q);
1371 ++__p;
1372 ++__q;
1373 }
1374 __n %= __k;
1375 if (__n == 0)
1376 return __ret;
1377 std::swap(__n, __k);
1378 __k = __n - __k;
1379 }
1380 else
1381 {
1382 __k = __n - __k;
1383 if (__is_pod(_ValueType) && __k == 1)
1384 {
1385 _ValueType __t = _GLIBCXX_MOVE(*(__p + __n - 1));
1386 _GLIBCXX_MOVE_BACKWARD3(__p, __p + __n - 1, __p + __n);
1387 *__p = _GLIBCXX_MOVE(__t);
1388 return __ret;
1389 }
1390 _RandomAccessIterator __q = __p + __n;
1391 __p = __q - __k;
1392 for (_Distance __i = 0; __i < __n - __k; ++ __i)
1393 {
1394 --__p;
1395 --__q;
1396 std::iter_swap(__p, __q);
1397 }
1398 __n %= __k;
1399 if (__n == 0)
1400 return __ret;
1401 std::swap(__n, __k);
1402 }
1403 }
1404 }
1405
1406 // _GLIBCXX_RESOLVE_LIB_DEFECTS
1407 // DR 488. rotate throws away useful information
1408 /**
1409 * @brief Rotate the elements of a sequence.
1410 * @ingroup mutating_algorithms
1411 * @param __first A forward iterator.
1412 * @param __middle A forward iterator.
1413 * @param __last A forward iterator.
1414 * @return first + (last - middle).
1415 *
1416 * Rotates the elements of the range @p [__first,__last) by
1417 * @p (__middle - __first) positions so that the element at @p __middle
1418 * is moved to @p __first, the element at @p __middle+1 is moved to
1419 * @p __first+1 and so on for each element in the range
1420 * @p [__first,__last).
1421 *
1422 * This effectively swaps the ranges @p [__first,__middle) and
1423 * @p [__middle,__last).
1424 *
1425 * Performs
1426 * @p *(__first+(n+(__last-__middle))%(__last-__first))=*(__first+n)
1427 * for each @p n in the range @p [0,__last-__first).
1428 */
1429 template<typename _ForwardIterator>
1430 inline _ForwardIterator
1431 rotate(_ForwardIterator __first, _ForwardIterator __middle,
1432 _ForwardIterator __last)
1433 {
1434 // concept requirements
1435 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
1436 _ForwardIterator>)
1437 __glibcxx_requires_valid_range(__first, __middle);
1438 __glibcxx_requires_valid_range(__middle, __last);
1439
1440 return std::__rotate(__first, __middle, __last,
1441 std::__iterator_category(__first));
1442 }
1443
1444 } // namespace _V2
1445
1446 /**
1447 * @brief Copy a sequence, rotating its elements.
1448 * @ingroup mutating_algorithms
1449 * @param __first A forward iterator.
1450 * @param __middle A forward iterator.
1451 * @param __last A forward iterator.
1452 * @param __result An output iterator.
1453 * @return An iterator designating the end of the resulting sequence.
1454 *
1455 * Copies the elements of the range @p [__first,__last) to the
1456 * range beginning at @p __result, rotating the copied elements by
1457 * @p (__middle-__first) positions so that the element at @p __middle
1458 * is moved to @p __result, the element at @p __middle+1 is moved
1459 * to @p __result+1 and so on for each element in the range @p
1460 * [__first,__last).
1461 *
1462 * Performs
1463 * @p *(__result+(n+(__last-__middle))%(__last-__first))=*(__first+n)
1464 * for each @p n in the range @p [0,__last-__first).
1465 */
1466 template<typename _ForwardIterator, typename _OutputIterator>
1467 inline _OutputIterator
1468 rotate_copy(_ForwardIterator __first, _ForwardIterator __middle,
1469 _ForwardIterator __last, _OutputIterator __result)
1470 {
1471 // concept requirements
1472 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
1473 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
1474 typename iterator_traits<_ForwardIterator>::value_type>)
1475 __glibcxx_requires_valid_range(__first, __middle);
1476 __glibcxx_requires_valid_range(__middle, __last);
1477
1478 return std::copy(__first, __middle,
1479 std::copy(__middle, __last, __result));
1480 }
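A usage sketch for rotate (illustrative only, hypothetical data): the element at __middle becomes the new first element, and since C++11 the return value points to where the old first element ended up, i.e. first + (last - middle).

    // Illustrative only: rotate makes the middle element the new front.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5};
      auto ret = std::rotate(v.begin(), v.begin() + 2, v.end());
      assert((v == std::vector<int>{3, 4, 5, 1, 2}));
      assert(*ret == 1);                               // first + (last - middle)
    }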
1481
1482 /// This is a helper function...
1483 template<typename _ForwardIterator, typename _Predicate>
1484 _ForwardIterator
1485 __partition(_ForwardIterator __first, _ForwardIterator __last,
1486 _Predicate __pred, forward_iterator_tag)
1487 {
1488 if (__first == __last)
1489 return __first;
1490
1491 while (__pred(*__first))
1492 if (++__first == __last)
1493 return __first;
1494
1495 _ForwardIterator __next = __first;
1496
1497 while (++__next != __last)
1498 if (__pred(*__next))
1499 {
1500 std::iter_swap(__first, __next);
1501 ++__first;
1502 }
1503
1504 return __first;
1505 }
1506
1507 /// This is a helper function...
1508 template<typename _BidirectionalIterator, typename _Predicate>
1509 _BidirectionalIterator
1510 __partition(_BidirectionalIterator __first, _BidirectionalIterator __last,
1511 _Predicate __pred, bidirectional_iterator_tag)
1512 {
1513 while (true)
1514 {
1515 while (true)
1516 if (__first == __last)
1517 return __first;
1518 else if (__pred(*__first))
1519 ++__first;
1520 else
1521 break;
1522 --__last;
1523 while (true)
1524 if (__first == __last)
1525 return __first;
1526 else if (!bool(__pred(*__last)))
1527 --__last;
1528 else
1529 break;
1530 std::iter_swap(__first, __last);
1531 ++__first;
1532 }
1533 }
1534
1535 // partition
1536
1537 /// This is a helper function...
1538 /// Requires __first != __last and !__pred(__first)
1539 /// and __len == distance(__first, __last).
1540 ///
1541 /// !__pred(__first) allows us to guarantee that we don't
1542 /// move-assign an element onto itself.
1543 template<typename _ForwardIterator, typename _Pointer, typename _Predicate,
1544 typename _Distance>
1545 _ForwardIterator
1546 __stable_partition_adaptive(_ForwardIterator __first,
1547 _ForwardIterator __last,
1548 _Predicate __pred, _Distance __len,
1549 _Pointer __buffer,
1550 _Distance __buffer_size)
1551 {
1552 if (__len == 1)
1553 return __first;
1554
1555 if (__len <= __buffer_size)
1556 {
1557 _ForwardIterator __result1 = __first;
1558 _Pointer __result2 = __buffer;
1559
1560 // The precondition guarantees that !__pred(__first), so
1561 // move that element to the buffer before starting the loop.
1562 // This ensures that we only call __pred once per element.
1563 *__result2 = _GLIBCXX_MOVE(*__first);
1564 ++__result2;
1565 ++__first;
1566 for (; __first != __last; ++__first)
1567 if (__pred(__first))
1568 {
1569 *__result1 = _GLIBCXX_MOVE(*__first);
1570 ++__result1;
1571 }
1572 else
1573 {
1574 *__result2 = _GLIBCXX_MOVE(*__first);
1575 ++__result2;
1576 }
1577
1578 _GLIBCXX_MOVE3(__buffer, __result2, __result1);
1579 return __result1;
1580 }
1581
1582 _ForwardIterator __middle = __first;
1583 std::advance(__middle, __len / 2);
1584 _ForwardIterator __left_split =
1585 std::__stable_partition_adaptive(__first, __middle, __pred,
1586 __len / 2, __buffer,
1587 __buffer_size);
1588
1589 // Advance past true-predicate values to satisfy this
1590 // function's preconditions.
1591 _Distance __right_len = __len - __len / 2;
1592 _ForwardIterator __right_split =
1593 std::__find_if_not_n(__middle, __right_len, __pred);
1594
1595 if (__right_len)
1596 __right_split =
1597 std::__stable_partition_adaptive(__right_split, __last, __pred,
1598 __right_len,
1599 __buffer, __buffer_size);
1600
1601 std::rotate(__left_split, __middle, __right_split);
1602 std::advance(__left_split, std::distance(__middle, __right_split));
1603 return __left_split;
1604 }
1605
1606 template<typename _ForwardIterator, typename _Predicate>
1607 _ForwardIterator
1608 __stable_partition(_ForwardIterator __first, _ForwardIterator __last,
1609 _Predicate __pred)
1610 {
1611 __first = std::__find_if_not(__first, __last, __pred);
1612
1613 if (__first == __last)
1614 return __first;
1615
1616 typedef typename iterator_traits<_ForwardIterator>::value_type
1617 _ValueType;
1618 typedef typename iterator_traits<_ForwardIterator>::difference_type
1619 _DistanceType;
1620
1621 _Temporary_buffer<_ForwardIterator, _ValueType> __buf(__first, __last);
1622 return
1623 std::__stable_partition_adaptive(__first, __last, __pred,
1624 _DistanceType(__buf.requested_size()),
1625 __buf.begin(),
1626 _DistanceType(__buf.size()));
1627 }
1628
1629 /**
1630 * @brief Move elements for which a predicate is true to the beginning
1631 * of a sequence, preserving relative ordering.
1632 * @ingroup mutating_algorithms
1633 * @param __first A forward iterator.
1634 * @param __last A forward iterator.
1635 * @param __pred A predicate functor.
1636 * @return An iterator @p middle such that @p __pred(i) is true for each
1637 * iterator @p i in the range @p [first,middle) and false for each @p i
1638 * in the range @p [middle,last).
1639 *
1640 * Performs the same function as @p partition() with the additional
1641 * guarantee that the relative ordering of elements in each group is
1642 * preserved, so any two elements @p x and @p y in the range
1643 * @p [__first,__last) such that @p __pred(x)==__pred(y) will have the same
1644 * relative ordering after calling @p stable_partition().
1645 */
1646 template<typename _ForwardIterator, typename _Predicate>
1647 inline _ForwardIterator
1648 stable_partition(_ForwardIterator __first, _ForwardIterator __last,
1649 _Predicate __pred)
1650 {
1651 // concept requirements
1652 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
1653 _ForwardIterator>)
1654 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
1655 typename iterator_traits<_ForwardIterator>::value_type>)
1656 __glibcxx_requires_valid_range(__first, __last);
1657
1658 return std::__stable_partition(__first, __last,
1659 __gnu_cxx::__ops::__pred_iter(__pred));
1660 }
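A sketch of stable_partition (illustrative only, hypothetical data): unlike partition, the relative order inside each group is preserved, which is exactly what the adaptive buffer machinery above exists to guarantee.

    // Illustrative only: stable_partition preserves relative order within each group.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 3, 4, 5, 6};
      std::stable_partition(v.begin(), v.end(), [](int x) { return x % 2 == 0; });
      assert((v == std::vector<int>{2, 4, 6, 1, 3, 5}));   // 2,4,6 and 1,3,5 keep their order
    }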
1661
1662 /// This is a helper function for the sort routines.
1663 template<typename _RandomAccessIterator, typename _Compare>
1664 void
1665 __heap_select(_RandomAccessIterator __first,
1666 _RandomAccessIterator __middle,
1667 _RandomAccessIterator __last, _Compare __comp)
1668 {
1669 std::__make_heap(__first, __middle, __comp);
1670 for (_RandomAccessIterator __i = __middle; __i < __last; ++__i)
1671 if (__comp(__i, __first))
1672 std::__pop_heap(__first, __middle, __i, __comp);
1673 }
1674
1675 // partial_sort
1676
1677 template<typename _InputIterator, typename _RandomAccessIterator,
1678 typename _Compare>
1679 _RandomAccessIterator
1680 __partial_sort_copy(_InputIterator __first, _InputIterator __last,
1681 _RandomAccessIterator __result_first,
1682 _RandomAccessIterator __result_last,
1683 _Compare __comp)
1684 {
1685 typedef typename iterator_traits<_InputIterator>::value_type
1686 _InputValueType;
1687 typedef iterator_traits<_RandomAccessIterator> _RItTraits;
1688 typedef typename _RItTraits::difference_type _DistanceType;
1689
1690 if (__result_first == __result_last)
1691 return __result_last;
1692 _RandomAccessIterator __result_real_last = __result_first;
1693 while (__first != __last && __result_real_last != __result_last)
1694 {
1695 *__result_real_last = *__first;
1696 ++__result_real_last;
1697 ++__first;
1698 }
1699
1700 std::__make_heap(__result_first, __result_real_last, __comp);
1701 while (__first != __last)
1702 {
1703 if (__comp(__first, __result_first))
1704 std::__adjust_heap(__result_first, _DistanceType(0),
1705 _DistanceType(__result_real_last
1706 - __result_first),
1707 _InputValueType(*__first), __comp);
1708 ++__first;
1709 }
1710 std::__sort_heap(__result_first, __result_real_last, __comp);
1711 return __result_real_last;
1712 }
1713
1714 /**
1715 * @brief Copy the smallest elements of a sequence.
1716 * @ingroup sorting_algorithms
1717 * @param __first An iterator.
1718 * @param __last Another iterator.
1719 * @param __result_first A random-access iterator.
1720 * @param __result_last Another random-access iterator.
1721 * @return An iterator indicating the end of the resulting sequence.
1722 *
1723 * Copies and sorts the smallest N values from the range @p [__first,__last)
1724 * to the range beginning at @p __result_first, where the number of
1725 * elements to be copied, @p N, is the smaller of @p (__last-__first) and
1726 * @p (__result_last-__result_first).
1727 * After the sort if @e i and @e j are iterators in the range
1728 * @p [__result_first,__result_first+N) such that i precedes j then
1729 * *j<*i is false.
1730 * The value returned is @p __result_first+N.
1731 */
1732 template<typename _InputIterator, typename _RandomAccessIterator>
1733 inline _RandomAccessIterator
1734 partial_sort_copy(_InputIterator __first, _InputIterator __last,
1735 _RandomAccessIterator __result_first,
1736 _RandomAccessIterator __result_last)
1737 {
1738#ifdef _GLIBCXX_CONCEPT_CHECKS
1739 typedef typename iterator_traits<_InputIterator>::value_type
1740 _InputValueType;
1741 typedef typename iterator_traits<_RandomAccessIterator>::value_type
1742 _OutputValueType;
1743#endif
1744
1745 // concept requirements
1746 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
1747 __glibcxx_function_requires(_ConvertibleConcept<_InputValueType,
1748 _OutputValueType>)
1749 __glibcxx_function_requires(_LessThanOpConcept<_InputValueType,
1750 _OutputValueType>)
1751 __glibcxx_function_requires(_LessThanComparableConcept<_OutputValueType>)
1752 __glibcxx_requires_valid_range(__first, __last);
1753 __glibcxx_requires_irreflexive(__first, __last);
1754 __glibcxx_requires_valid_range(__result_first, __result_last);
1755
1756 return std::__partial_sort_copy(__first, __last,
1757 __result_first, __result_last,
1758 __gnu_cxx::__ops::__iter_less_iter());
1759 }
1760
1761 /**
1762 * @brief Copy the smallest elements of a sequence using a predicate for
1763 * comparison.
1764 * @ingroup sorting_algorithms
1765 * @param __first An input iterator.
1766 * @param __last Another input iterator.
1767 * @param __result_first A random-access iterator.
1768 * @param __result_last Another random-access iterator.
1769 * @param __comp A comparison functor.
1770 * @return An iterator indicating the end of the resulting sequence.
1771 *
1772 * Copies and sorts the smallest N values from the range @p [__first,__last)
1773 * to the range beginning at @p result_first, where the number of
1774 * elements to be copied, @p N, is the smaller of @p (__last-__first) and
1775 * @p (__result_last-__result_first).
1776 * After the sort if @e i and @e j are iterators in the range
1777 * @p [__result_first,__result_first+N) such that i precedes j then
1778 * @p __comp(*j,*i) is false.
1779 * The value returned is @p __result_first+N.
1780 */
1781 template<typename _InputIterator, typename _RandomAccessIterator,
1782 typename _Compare>
1783 inline _RandomAccessIterator
1784 partial_sort_copy(_InputIterator __first, _InputIterator __last,
1785 _RandomAccessIterator __result_first,
1786 _RandomAccessIterator __result_last,
1787 _Compare __comp)
1788 {
1789#ifdef _GLIBCXX_CONCEPT_CHECKS
1790 typedef typename iterator_traits<_InputIterator>::value_type
1791 _InputValueType;
1792 typedef typename iterator_traits<_RandomAccessIterator>::value_type
1793 _OutputValueType;
1794#endif
1795
1796 // concept requirements
1797 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
1798 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
1799 _RandomAccessIterator>)
1800 __glibcxx_function_requires(_ConvertibleConcept<_InputValueType,
1801 _OutputValueType>)
1802 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
1803 _InputValueType, _OutputValueType>)
1804 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
1805 _OutputValueType, _OutputValueType>)
1806 __glibcxx_requires_valid_range(__first, __last);
1807 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
1808 __glibcxx_requires_valid_range(__result_first, __result_last);
1809
1810 return std::__partial_sort_copy(__first, __last,
1811 __result_first, __result_last,
1812 __gnu_cxx::__ops::__iter_comp_iter(__comp));
1813 }
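A usage sketch for partial_sort_copy (illustrative only, hypothetical data): the destination receives the smallest N elements in sorted order, where N is the smaller of the two range lengths.

    // Illustrative only: partial_sort_copy fills the destination with the smallest elements.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> src{5, 1, 4, 2, 3};
      std::vector<int> dst(3);                         // room for the three smallest
      std::partial_sort_copy(src.begin(), src.end(), dst.begin(), dst.end());
      assert((dst == std::vector<int>{1, 2, 3}));
    }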
1814
1815 /// This is a helper function for the sort routine.
1816 template<typename _RandomAccessIterator, typename _Compare>
1817 void
1818 __unguarded_linear_insert(_RandomAccessIterator __last,
1819 _Compare __comp)
1820 {
1821 typename iterator_traits<_RandomAccessIterator>::value_type
1822 __val = _GLIBCXX_MOVE(*__last);
1823 _RandomAccessIterator __next = __last;
1824 --__next;
1825 while (__comp(__val, __next))
1826 {
1827 *__last = _GLIBCXX_MOVE(*__next);
1828 __last = __next;
1829 --__next;
1830 }
1831 *__last = _GLIBCXX_MOVE(__val);
1832 }
1833
1834 /// This is a helper function for the sort routine.
1835 template<typename _RandomAccessIterator, typename _Compare>
1836 void
1837 __insertion_sort(_RandomAccessIterator __first,
1838 _RandomAccessIterator __last, _Compare __comp)
1839 {
1840 if (__first == __last) return;
1841
1842 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
1843 {
1844 if (__comp(__i, __first))
1845 {
1846 typename iterator_traits<_RandomAccessIterator>::value_type
1847 __val = _GLIBCXX_MOVE(*__i);
1848 _GLIBCXX_MOVE_BACKWARD3(__first, __i, __i + 1);
1849 *__first = _GLIBCXX_MOVE(__val);
1850 }
1851 else
1852 std::__unguarded_linear_insert(__i,
1853 __gnu_cxx::__ops::__val_comp_iter(__comp));
1854 }
1855 }
1856
1857 /// This is a helper function for the sort routine.
1858 template<typename _RandomAccessIterator, typename _Compare>
1859 inline void
1860 __unguarded_insertion_sort(_RandomAccessIterator __first,
1861 _RandomAccessIterator __last, _Compare __comp)
1862 {
1863 for (_RandomAccessIterator __i = __first; __i != __last; ++__i)
1864 std::__unguarded_linear_insert(__i,
1865 __gnu_cxx::__ops::__val_comp_iter(__comp));
1866 }
1867
1868 /**
1869 * @doctodo
1870 * This controls some aspect of the sort routines.
1871 */
1872 enum { _S_threshold = 16 };
1873
1874 /// This is a helper function for the sort routine.
1875 template<typename _RandomAccessIterator, typename _Compare>
1876 void
1877 __final_insertion_sort(_RandomAccessIterator __first,
1878 _RandomAccessIterator __last, _Compare __comp)
1879 {
1880 if (__last - __first > int(_S_threshold))
1881 {
1882 std::__insertion_sort(__first, __first + int(_S_threshold), __comp);
1883 std::__unguarded_insertion_sort(__first + int(_S_threshold), __last,
1884 __comp);
1885 }
1886 else
1887 std::__insertion_sort(__first, __last, __comp);
1888 }
1889
1890 /// This is a helper function...
1891 template<typename _RandomAccessIterator, typename _Compare>
1892 _RandomAccessIterator
1893 __unguarded_partition(_RandomAccessIterator __first,
1894 _RandomAccessIterator __last,
1895 _RandomAccessIterator __pivot, _Compare __comp)
1896 {
1897 while (true)
1898 {
1899 while (__comp(__first, __pivot))
1900 ++__first;
1901 --__last;
1902 while (__comp(__pivot, __last))
1903 --__last;
1904 if (!(__first < __last))
1905 return __first;
1906 std::iter_swap(__first, __last);
1907 ++__first;
1908 }
1909 }
1910
1911 /// This is a helper function...
1912 template<typename _RandomAccessIterator, typename _Compare>
1913 inline _RandomAccessIterator
1914 __unguarded_partition_pivot(_RandomAccessIterator __first,
1915 _RandomAccessIterator __last, _Compare __comp)
1916 {
1917 _RandomAccessIterator __mid = __first + (__last - __first) / 2;
1918 std::__move_median_to_first(__first, __first + 1, __mid, __last - 1,
1919 __comp);
1920 return std::__unguarded_partition(__first + 1, __last, __first, __comp);
1921 }
1922
1923 template<typename _RandomAccessIterator, typename _Compare>
1924 inline void
1925 __partial_sort(_RandomAccessIterator __first,
1926 _RandomAccessIterator __middle,
1927 _RandomAccessIterator __last,
1928 _Compare __comp)
1929 {
1930 std::__heap_select(__first, __middle, __last, __comp);
1931 std::__sort_heap(__first, __middle, __comp);
1932 }
1933
1934 /// This is a helper function for the sort routine.
1935 template<typename _RandomAccessIterator, typename _Size, typename _Compare>
1936 void
1937 __introsort_loop(_RandomAccessIterator __first,
1938 _RandomAccessIterator __last,
1939 _Size __depth_limit, _Compare __comp)
1940 {
1941 while (__last - __first > int(_S_threshold))
1942 {
1943 if (__depth_limit == 0)
1944 {
1945 std::__partial_sort(__first, __last, __last, __comp);
1946 return;
1947 }
1948 --__depth_limit;
1949 _RandomAccessIterator __cut =
1950 std::__unguarded_partition_pivot(__first, __last, __comp);
1951 std::__introsort_loop(__cut, __last, __depth_limit, __comp);
1952 __last = __cut;
1953 }
1954 }
1955
1956 // sort
1957
1958 template<typename _RandomAccessIterator, typename _Compare>
1959 inline void
1960 __sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
1961 _Compare __comp)
1962 {
1963 if (__first != __last)
1964 {
1965 std::__introsort_loop(__first, __last,
1966 std::__lg(__last - __first) * 2,
1967 __comp);
1968 std::__final_insertion_sort(__first, __last, __comp);
1969 }
1970 }
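The helpers above implement introsort: quicksort with unguarded partitioning up to a depth limit of 2*lg(n), a heapsort fallback (__partial_sort over the whole subrange) when that limit is hit, and a final insertion-sort pass over the nearly sorted result. A sketch of the public entry point that dispatches to this machinery (illustrative only, hypothetical data):

    // Illustrative only: std::sort with a comparator, backed by the introsort loop above.
    #include <algorithm>
    #include <cassert>
    #include <functional>
    #include <vector>

    int main()
    {
      std::vector<int> v{3, 1, 4, 1, 5, 9, 2, 6};
      std::sort(v.begin(), v.end(), std::greater<int>());
      assert(std::is_sorted(v.begin(), v.end(), std::greater<int>()));
    }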
1971
1972 template<typename _RandomAccessIterator, typename _Size, typename _Compare>
1973 void
1974 __introselect(_RandomAccessIterator __first, _RandomAccessIterator __nth,
1975 _RandomAccessIterator __last, _Size __depth_limit,
1976 _Compare __comp)
1977 {
1978 while (__last - __first > 3)
1979 {
1980 if (__depth_limit == 0)
1981 {
1982 std::__heap_select(__first, __nth + 1, __last, __comp);
1983 // Place the nth largest element in its final position.
1984 std::iter_swap(__first, __nth);
1985 return;
1986 }
1987 --__depth_limit;
1988 _RandomAccessIterator __cut =
1989 std::__unguarded_partition_pivot(__first, __last, __comp);
1990 if (__cut <= __nth)
1991 __first = __cut;
1992 else
1993 __last = __cut;
1994 }
1995 std::__insertion_sort(__first, __last, __comp);
1996 }
1997
1998 // nth_element
1999
2000 // lower_bound moved to stl_algobase.h
2001
2002 /**
2003 * @brief Finds the first position in which @p __val could be inserted
2004 * without changing the ordering.
2005 * @ingroup binary_search_algorithms
2006 * @param __first An iterator.
2007 * @param __last Another iterator.
2008 * @param __val The search term.
2009 * @param __comp A functor to use for comparisons.
2010 * @return An iterator pointing to the first element <em>not less
2011 * than</em> @p __val, or end() if every element is less
2012 * than @p __val.
2013 * @ingroup binary_search_algorithms
2014 *
2015 * The comparison function should have the same effects on ordering as
2016 * the function used for the initial sort.
2017 */
2018 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2019 inline _ForwardIterator
2020 lower_bound(_ForwardIterator __first, _ForwardIterator __last,
2021 const _Tp& __val, _Compare __comp)
2022 {
2023 // concept requirements
2024 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2025 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2026 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
2027 __glibcxx_requires_partitioned_lower_pred(__first, __last,
2028 __val, __comp);
2029
2030 return std::__lower_bound(__first, __last, __val,
2031 __gnu_cxx::__ops::__iter_comp_val(__comp));
2032 }
2033
2034 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2035 _ForwardIterator
2036 __upper_bound(_ForwardIterator __first, _ForwardIterator __last,
2037 const _Tp& __val, _Compare __comp)
2038 {
2039 typedef typename iterator_traits<_ForwardIterator>::difference_type
2040 _DistanceType;
2041
2042 _DistanceType __len = std::distance(__first, __last);
2043
2044 while (__len > 0)
2045 {
2046 _DistanceType __half = __len >> 1;
2047 _ForwardIterator __middle = __first;
2048 std::advance(__middle, __half);
2049 if (__comp(__val, __middle))
2050 __len = __half;
2051 else
2052 {
2053 __first = __middle;
2054 ++__first;
2055 __len = __len - __half - 1;
2056 }
2057 }
2058 return __first;
2059 }
2060
2061 /**
2062 * @brief Finds the last position in which @p __val could be inserted
2063 * without changing the ordering.
2064 * @ingroup binary_search_algorithms
2065 * @param __first An iterator.
2066 * @param __last Another iterator.
2067 * @param __val The search term.
2068 * @return An iterator pointing to the first element greater than @p __val,
2069 * or end() if no elements are greater than @p __val.
2070 * @ingroup binary_search_algorithms
2071 */
2072 template<typename _ForwardIterator, typename _Tp>
2073 inline _ForwardIterator
2074 upper_bound(_ForwardIterator __first, _ForwardIterator __last,
2075 const _Tp& __val)
2076 {
2077 // concept requirements
2078 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2079 __glibcxx_function_requires(_LessThanOpConcept<
2080 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2081 __glibcxx_requires_partitioned_upper(__first, __last, __val);
2082
2083 return std::__upper_bound(__first, __last, __val,
2084 __gnu_cxx::__ops::__val_less_iter());
2085 }
2086
2087 /**
2088 * @brief Finds the last position in which @p __val could be inserted
2089 * without changing the ordering.
2090 * @ingroup binary_search_algorithms
2091 * @param __first An iterator.
2092 * @param __last Another iterator.
2093 * @param __val The search term.
2094 * @param __comp A functor to use for comparisons.
2095 * @return An iterator pointing to the first element greater than @p __val,
2096 * or end() if no elements are greater than @p __val.
2097 * @ingroup binary_search_algorithms
2098 *
2099 * The comparison function should have the same effects on ordering as
2100 * the function used for the initial sort.
2101 */
2102 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2103 inline _ForwardIterator
2104 upper_bound(_ForwardIterator __first, _ForwardIterator __last,
2105 const _Tp& __val, _Compare __comp)
2106 {
2107 // concept requirements
2108 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2109 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2110 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2111 __glibcxx_requires_partitioned_upper_pred(__first, __last,
2112 __val, __comp);
2113
2114 return std::__upper_bound(__first, __last, __val,
2115 __gnu_cxx::__ops::__val_comp_iter(__comp));
2116 }
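A sketch showing how lower_bound and upper_bound bracket a run of equal elements in a sorted range (illustrative only, hypothetical data):

    // Illustrative only: lower_bound/upper_bound bracket the run of elements equal to the key.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 2, 2, 3, 4};
      auto lo = std::lower_bound(v.begin(), v.end(), 2);
      auto hi = std::upper_bound(v.begin(), v.end(), 2);
      assert(lo - v.begin() == 1 && hi - v.begin() == 4);
    }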
2117
2118 template<typename _ForwardIterator, typename _Tp,
2119 typename _CompareItTp, typename _CompareTpIt>
2120 pair<_ForwardIterator, _ForwardIterator>
2121 __equal_range(_ForwardIterator __first, _ForwardIterator __last,
2122 const _Tp& __val,
2123 _CompareItTp __comp_it_val, _CompareTpIt __comp_val_it)
2124 {
2125 typedef typename iterator_traits<_ForwardIterator>::difference_type
2126 _DistanceType;
2127
2128 _DistanceType __len = std::distance(__first, __last);
2129
2130 while (__len > 0)
2131 {
2132 _DistanceType __half = __len >> 1;
2133 _ForwardIterator __middle = __first;
2134 std::advance(__middle, __half);
2135 if (__comp_it_val(__middle, __val))
2136 {
2137 __first = __middle;
2138 ++__first;
2139 __len = __len - __half - 1;
2140 }
2141 else if (__comp_val_it(__val, __middle))
2142 __len = __half;
2143 else
2144 {
2145 _ForwardIterator __left
2146 = std::__lower_bound(__first, __middle, __val, __comp_it_val);
2147 std::advance(__first, __len);
2148 _ForwardIterator __right
2149 = std::__upper_bound(++__middle, __first, __val, __comp_val_it);
2150 return pair<_ForwardIterator, _ForwardIterator>(__left, __right);
2151 }
2152 }
2153 return pair<_ForwardIterator, _ForwardIterator>(__first, __first);
2154 }
2155
2156 /**
2157 * @brief Finds the largest subrange in which @p __val could be inserted
2158 * at any place in it without changing the ordering.
2159 * @ingroup binary_search_algorithms
2160 * @param __first An iterator.
2161 * @param __last Another iterator.
2162 * @param __val The search term.
2163 * @return A pair of iterators defining the subrange.
2164 * @ingroup binary_search_algorithms
2165 *
2166 * This is equivalent to
2167 * @code
2168 * std::make_pair(lower_bound(__first, __last, __val),
2169 * upper_bound(__first, __last, __val))
2170 * @endcode
2171 * but does not actually call those functions.
2172 */
2173 template<typename _ForwardIterator, typename _Tp>
2174 inline pair<_ForwardIterator, _ForwardIterator>
2175 equal_range(_ForwardIterator __first, _ForwardIterator __last,
2176 const _Tp& __val)
2177 {
2178 // concept requirements
2179 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2180 __glibcxx_function_requires(_LessThanOpConcept<
2181 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
2182 __glibcxx_function_requires(_LessThanOpConcept<
2183 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2184 __glibcxx_requires_partitioned_lower(__first, __last, __val);
2185 __glibcxx_requires_partitioned_upper(__first, __last, __val);
2186
2187 return std::__equal_range(__first, __last, __val,
2188 __gnu_cxx::__ops::__iter_less_val(),
2189 __gnu_cxx::__ops::__val_less_iter());
2190 }
2191
2192 /**
2193 * @brief Finds the largest subrange in which @p __val could be inserted
2194 * at any place in it without changing the ordering.
2195 * @param __first An iterator.
2196 * @param __last Another iterator.
2197 * @param __val The search term.
2198 * @param __comp A functor to use for comparisons.
2199 * @return A pair of iterators defining the subrange.
2200 * @ingroup binary_search_algorithms
2201 *
2202 * This is equivalent to
2203 * @code
2204 * std::make_pair(lower_bound(__first, __last, __val, __comp),
2205 * upper_bound(__first, __last, __val, __comp))
2206 * @endcode
2207 * but does not actually call those functions.
2208 */
2209 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2210 inline pair<_ForwardIterator, _ForwardIterator>
2211 equal_range(_ForwardIterator __first, _ForwardIterator __last,
2212 const _Tp& __val, _Compare __comp)
2213 {
2214 // concept requirements
2215 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2216 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2217 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
2218 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2219 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2220 __glibcxx_requires_partitioned_lower_pred(__first, __last,
2221 __val, __comp);
2222 __glibcxx_requires_partitioned_upper_pred(__first, __last,
2223 __val, __comp);
2224
2225 return std::__equal_range(__first, __last, __val,
2226 __gnu_cxx::__ops::__iter_comp_val(__comp),
2227 __gnu_cxx::__ops::__val_comp_iter(__comp));
2228 }
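A usage sketch for equal_range (illustrative only, hypothetical data): it returns the same pair that lower_bound and upper_bound would, computed in a single binary search.

    // Illustrative only: equal_range returns the lower_bound/upper_bound pair in one call.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main()
    {
      std::vector<int> v{1, 2, 2, 2, 3};
      auto p = std::equal_range(v.begin(), v.end(), 2);
      assert(p.first - v.begin() == 1 && p.second - v.begin() == 4);
    }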
2229
2230 /**
2231 * @brief Determines whether an element exists in a range.
2232 * @ingroup binary_search_algorithms
2233 * @param __first An iterator.
2234 * @param __last Another iterator.
2235 * @param __val The search term.
2236 * @return True if @p __val (or its equivalent) is in [@p
2237 * __first,@p __last ].
2238 *
2239 * Note that this does not actually return an iterator to @p __val. For
2240 * that, use std::find or a container's specialized find member functions.
2241 */
2242 template<typename _ForwardIterator, typename _Tp>
2243 bool
2244 binary_search(_ForwardIterator __first, _ForwardIterator __last,
2245 const _Tp& __val)
2246 {
2247 // concept requirements
2248 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2249 __glibcxx_function_requires(_LessThanOpConcept<
2250 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2251 __glibcxx_requires_partitioned_lower(__first, __last, __val);
2252 __glibcxx_requires_partitioned_upper(__first, __last, __val);
2253
2254 _ForwardIterator __i
2255 = std::__lower_bound(__first, __last, __val,
2256 __gnu_cxx::__ops::__iter_less_val());
2257 return __i != __last && !(__val < *__i);
2258 }
2259
2260 /**
2261 * @brief Determines whether an element exists in a range.
2262 * @ingroup binary_search_algorithms
2263 * @param __first An iterator.
2264 * @param __last Another iterator.
2265 * @param __val The search term.
2266 * @param __comp A functor to use for comparisons.
2267 * @return True if @p __val (or its equivalent) is in @p [__first,__last].
2268 *
2269 * Note that this does not actually return an iterator to @p __val. For
2270 * that, use std::find or a container's specialized find member functions.
2271 *
2272 * The comparison function should have the same effects on ordering as
2273 * the function used for the initial sort.
2274 */
2275 template<typename _ForwardIterator, typename _Tp, typename _Compare>
2276 bool
2277 binary_search(_ForwardIterator __first, _ForwardIterator __last,
2278 const _Tp& __val, _Compare __comp)
2279 {
2280 // concept requirements
2281 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
2282 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2283 _Tp, typename iterator_traits<_ForwardIterator>::value_type>)
2284 __glibcxx_requires_partitioned_lower_pred(__first, __last,
2285 __val, __comp);
2286 __glibcxx_requires_partitioned_upper_pred(__first, __last,
2287 __val, __comp);
2288
2289 _ForwardIterator __i
2290 = std::__lower_bound(__first, __last, __val,
2291 __gnu_cxx::__ops::__iter_comp_val(__comp));
2292 return __i != __last && !bool(__comp(__val, *__i));
2293 }
2294
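A minimal usage sketch of std::binary_search (not part of the header; the sample data is illustrative). As the comments above note, it only reports presence; use lower_bound or equal_range to obtain an iterator:

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void binary_search_sketch()
{
  const std::vector<int> v{1, 3, 4, 4, 7};               // sorted input is a precondition
  assert(std::binary_search(v.begin(), v.end(), 4));     // present
  assert(!std::binary_search(v.begin(), v.end(), 5));    // absent; only a bool is returned
}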
2295 // merge
2296
2297 /// This is a helper function for the __merge_adaptive routines.
2298 template<typename _InputIterator1, typename _InputIterator2,
2299 typename _OutputIterator, typename _Compare>
2300 void
2301 __move_merge_adaptive(_InputIterator1 __first1, _InputIterator1 __last1,
2302 _InputIterator2 __first2, _InputIterator2 __last2,
2303 _OutputIterator __result, _Compare __comp)
2304 {
2305 while (__first1 != __last1 && __first2 != __last2)
2306 {
2307 if (__comp(__first2, __first1))
2308 {
2309	      *__result = _GLIBCXX_MOVE(*__first2);
2310 ++__first2;
2311 }
2312 else
2313 {
2314	      *__result = _GLIBCXX_MOVE(*__first1);
2315 ++__first1;
2316 }
2317 ++__result;
2318 }
2319 if (__first1 != __last1)
2320	_GLIBCXX_MOVE3(__first1, __last1, __result);
2321 }
2322
2323 /// This is a helper function for the __merge_adaptive routines.
2324 template<typename _BidirectionalIterator1, typename _BidirectionalIterator2,
2325 typename _BidirectionalIterator3, typename _Compare>
2326 void
2327 __move_merge_adaptive_backward(_BidirectionalIterator1 __first1,
2328 _BidirectionalIterator1 __last1,
2329 _BidirectionalIterator2 __first2,
2330 _BidirectionalIterator2 __last2,
2331 _BidirectionalIterator3 __result,
2332 _Compare __comp)
2333 {
2334 if (__first1 == __last1)
2335 {
2336	  _GLIBCXX_MOVE_BACKWARD3(__first2, __last2, __result);
2337 return;
2338 }
2339 else if (__first2 == __last2)
2340 return;
2341
2342 --__last1;
2343 --__last2;
2344 while (true)
2345 {
2346 if (__comp(__last2, __last1))
2347 {
2348	      *--__result = _GLIBCXX_MOVE(*__last1);
2349 if (__first1 == __last1)
2350 {
2351	      _GLIBCXX_MOVE_BACKWARD3(__first2, ++__last2, __result);
2352 return;
2353 }
2354 --__last1;
2355 }
2356 else
2357 {
2358	      *--__result = _GLIBCXX_MOVE(*__last2);
2359 if (__first2 == __last2)
2360 return;
2361 --__last2;
2362 }
2363 }
2364 }
2365
2366 /// This is a helper function for the merge routines.
2367 template<typename _BidirectionalIterator1, typename _BidirectionalIterator2,
2368 typename _Distance>
2369 _BidirectionalIterator1
2370 __rotate_adaptive(_BidirectionalIterator1 __first,
2371 _BidirectionalIterator1 __middle,
2372 _BidirectionalIterator1 __last,
2373 _Distance __len1, _Distance __len2,
2374 _BidirectionalIterator2 __buffer,
2375 _Distance __buffer_size)
2376 {
2377 _BidirectionalIterator2 __buffer_end;
2378 if (__len1 > __len2 && __len2 <= __buffer_size)
2379 {
2380 if (__len2)
2381 {
2382	      __buffer_end = _GLIBCXX_MOVE3(__middle, __last, __buffer);
2383	      _GLIBCXX_MOVE_BACKWARD3(__first, __middle, __last);
2384	      return _GLIBCXX_MOVE3(__buffer, __buffer_end, __first);
2385 }
2386 else
2387 return __first;
2388 }
2389 else if (__len1 <= __buffer_size)
2390 {
2391 if (__len1)
2392 {
2393	      __buffer_end = _GLIBCXX_MOVE3(__first, __middle, __buffer);
2394	      _GLIBCXX_MOVE3(__middle, __last, __first);
2395	      return _GLIBCXX_MOVE_BACKWARD3(__buffer, __buffer_end, __last);
2396 }
2397 else
2398 return __last;
2399 }
2400 else
2401 {
2402 std::rotate(__first, __middle, __last);
2403 std::advance(__first, std::distance(__middle, __last));
2404 return __first;
2405 }
2406 }
2407
2408 /// This is a helper function for the merge routines.
2409 template<typename _BidirectionalIterator, typename _Distance,
2410 typename _Pointer, typename _Compare>
2411 void
2412 __merge_adaptive(_BidirectionalIterator __first,
2413 _BidirectionalIterator __middle,
2414 _BidirectionalIterator __last,
2415 _Distance __len1, _Distance __len2,
2416 _Pointer __buffer, _Distance __buffer_size,
2417 _Compare __comp)
2418 {
2419 if (__len1 <= __len2 && __len1 <= __buffer_size)
2420 {
2421	  _Pointer __buffer_end = _GLIBCXX_MOVE3(__first, __middle, __buffer);
2422 std::__move_merge_adaptive(__buffer, __buffer_end, __middle, __last,
2423 __first, __comp);
2424 }
2425 else if (__len2 <= __buffer_size)
2426 {
2427	  _Pointer __buffer_end = _GLIBCXX_MOVE3(__middle, __last, __buffer);
2428 std::__move_merge_adaptive_backward(__first, __middle, __buffer,
2429 __buffer_end, __last, __comp);
2430 }
2431 else
2432 {
2433 _BidirectionalIterator __first_cut = __first;
2434 _BidirectionalIterator __second_cut = __middle;
2435 _Distance __len11 = 0;
2436 _Distance __len22 = 0;
2437 if (__len1 > __len2)
2438 {
2439 __len11 = __len1 / 2;
2440 std::advance(__first_cut, __len11);
2441 __second_cut
2442 = std::__lower_bound(__middle, __last, *__first_cut,
2443 __gnu_cxx::__ops::__iter_comp_val(__comp));
2444 __len22 = std::distance(__middle, __second_cut);
2445 }
2446 else
2447 {
2448 __len22 = __len2 / 2;
2449 std::advance(__second_cut, __len22);
2450 __first_cut
2451 = std::__upper_bound(__first, __middle, *__second_cut,
2452 __gnu_cxx::__ops::__val_comp_iter(__comp));
2453 __len11 = std::distance(__first, __first_cut);
2454 }
2455
2456 _BidirectionalIterator __new_middle
2457 = std::__rotate_adaptive(__first_cut, __middle, __second_cut,
2458 __len1 - __len11, __len22, __buffer,
2459 __buffer_size);
2460 std::__merge_adaptive(__first, __first_cut, __new_middle, __len11,
2461 __len22, __buffer, __buffer_size, __comp);
2462 std::__merge_adaptive(__new_middle, __second_cut, __last,
2463 __len1 - __len11,
2464 __len2 - __len22, __buffer,
2465 __buffer_size, __comp);
2466 }
2467 }
2468
2469 /// This is a helper function for the merge routines.
2470 template<typename _BidirectionalIterator, typename _Distance,
2471 typename _Compare>
2472 void
2473 __merge_without_buffer(_BidirectionalIterator __first,
2474 _BidirectionalIterator __middle,
2475 _BidirectionalIterator __last,
2476 _Distance __len1, _Distance __len2,
2477 _Compare __comp)
2478 {
2479 if (__len1 == 0 || __len2 == 0)
2480 return;
2481
2482 if (__len1 + __len2 == 2)
2483 {
2484 if (__comp(__middle, __first))
2485 std::iter_swap(__first, __middle);
2486 return;
2487 }
2488
2489 _BidirectionalIterator __first_cut = __first;
2490 _BidirectionalIterator __second_cut = __middle;
2491 _Distance __len11 = 0;
2492 _Distance __len22 = 0;
2493 if (__len1 > __len2)
2494 {
2495 __len11 = __len1 / 2;
2496 std::advance(__first_cut, __len11);
2497 __second_cut
2498 = std::__lower_bound(__middle, __last, *__first_cut,
2499 __gnu_cxx::__ops::__iter_comp_val(__comp));
2500 __len22 = std::distance(__middle, __second_cut);
2501 }
2502 else
2503 {
2504 __len22 = __len2 / 2;
2505 std::advance(__second_cut, __len22);
2506 __first_cut
2507 = std::__upper_bound(__first, __middle, *__second_cut,
2508 __gnu_cxx::__ops::__val_comp_iter(__comp));
2509 __len11 = std::distance(__first, __first_cut);
2510 }
2511
2512 std::rotate(__first_cut, __middle, __second_cut);
2513 _BidirectionalIterator __new_middle = __first_cut;
2514 std::advance(__new_middle, std::distance(__middle, __second_cut));
2515 std::__merge_without_buffer(__first, __first_cut, __new_middle,
2516 __len11, __len22, __comp);
2517 std::__merge_without_buffer(__new_middle, __second_cut, __last,
2518 __len1 - __len11, __len2 - __len22, __comp);
2519 }
2520
2521 template<typename _BidirectionalIterator, typename _Compare>
2522 void
2523 __inplace_merge(_BidirectionalIterator __first,
2524 _BidirectionalIterator __middle,
2525 _BidirectionalIterator __last,
2526 _Compare __comp)
2527 {
2528 typedef typename iterator_traits<_BidirectionalIterator>::value_type
2529 _ValueType;
2530 typedef typename iterator_traits<_BidirectionalIterator>::difference_type
2531 _DistanceType;
2532
2533 if (__first == __middle || __middle == __last)
2534 return;
2535
2536 const _DistanceType __len1 = std::distance(__first, __middle);
2537 const _DistanceType __len2 = std::distance(__middle, __last);
2538
2539 typedef _Temporary_buffer<_BidirectionalIterator, _ValueType> _TmpBuf;
2540 _TmpBuf __buf(__first, __last);
2541
2542 if (__buf.begin() == 0)
2543 std::__merge_without_buffer
2544 (__first, __middle, __last, __len1, __len2, __comp);
2545 else
2546 std::__merge_adaptive
2547 (__first, __middle, __last, __len1, __len2, __buf.begin(),
2548 _DistanceType(__buf.size()), __comp);
2549 }
2550
2551 /**
2552 * @brief Merges two sorted ranges in place.
2553 * @ingroup sorting_algorithms
2554 * @param __first An iterator.
2555 * @param __middle Another iterator.
2556 * @param __last Another iterator.
2557 * @return Nothing.
2558 *
2559 * Merges two sorted and consecutive ranges, [__first,__middle) and
2560 * [__middle,__last), and puts the result in [__first,__last). The
2561 * output will be sorted. The sort is @e stable, that is, for
2562 * equivalent elements in the two ranges, elements from the first
2563 * range will always come before elements from the second.
2564 *
2565 * If enough additional memory is available, this takes (__last-__first)-1
2566 * comparisons. Otherwise an NlogN algorithm is used, where N is
2567 * distance(__first,__last).
2568 */
2569 template<typename _BidirectionalIterator>
2570 inline void
2571 inplace_merge(_BidirectionalIterator __first,
2572 _BidirectionalIterator __middle,
2573 _BidirectionalIterator __last)
2574 {
2575 // concept requirements
2576 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
2577 _BidirectionalIterator>)
2578 __glibcxx_function_requires(_LessThanComparableConcept<
2579 typename iterator_traits<_BidirectionalIterator>::value_type>)
2580 __glibcxx_requires_sorted(__first, __middle);
2581 __glibcxx_requires_sorted(__middle, __last);
2582 __glibcxx_requires_irreflexive(__first, __last);
2583
2584 std::__inplace_merge(__first, __middle, __last,
2585 __gnu_cxx::__ops::__iter_less_iter());
2586 }
2587
2588 /**
2589 * @brief Merges two sorted ranges in place.
2590 * @ingroup sorting_algorithms
2591 * @param __first An iterator.
2592 * @param __middle Another iterator.
2593 * @param __last Another iterator.
2594 * @param __comp A functor to use for comparisons.
2595 * @return Nothing.
2596 *
2597 * Merges two sorted and consecutive ranges, [__first,__middle) and
2598   *  [__middle,__last), and puts the result in [__first,__last).  The output will
2599 * be sorted. The sort is @e stable, that is, for equivalent
2600 * elements in the two ranges, elements from the first range will always
2601 * come before elements from the second.
2602 *
2603 * If enough additional memory is available, this takes (__last-__first)-1
2604 * comparisons. Otherwise an NlogN algorithm is used, where N is
2605 * distance(__first,__last).
2606 *
2607 * The comparison function should have the same effects on ordering as
2608 * the function used for the initial sort.
2609 */
2610 template<typename _BidirectionalIterator, typename _Compare>
2611 inline void
2612 inplace_merge(_BidirectionalIterator __first,
2613 _BidirectionalIterator __middle,
2614 _BidirectionalIterator __last,
2615 _Compare __comp)
2616 {
2617 // concept requirements
2618 __glibcxx_function_requires(_Mutable_BidirectionalIteratorConcept<
2619 _BidirectionalIterator>)
2620 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2621 typename iterator_traits<_BidirectionalIterator>::value_type,
2622 typename iterator_traits<_BidirectionalIterator>::value_type>)
2623 __glibcxx_requires_sorted_pred(__first, __middle, __comp);
2624 __glibcxx_requires_sorted_pred(__middle, __last, __comp);
2625 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
2626
2627 std::__inplace_merge(__first, __middle, __last,
2628 __gnu_cxx::__ops::__iter_comp_iter(__comp));
2629 }
2630
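A minimal usage sketch of std::inplace_merge as documented above (not part of the header; the vector contents are illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void inplace_merge_sketch()
{
  // [begin, begin+3) and [begin+3, end) are each sorted before the call.
  std::vector<int> v{1, 4, 7, 2, 3, 9};
  std::inplace_merge(v.begin(), v.begin() + 3, v.end());
  assert(std::is_sorted(v.begin(), v.end()));
}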
2631
2632 /// This is a helper function for the __merge_sort_loop routines.
2633 template<typename _InputIterator, typename _OutputIterator,
2634 typename _Compare>
2635 _OutputIterator
2636 __move_merge(_InputIterator __first1, _InputIterator __last1,
2637 _InputIterator __first2, _InputIterator __last2,
2638 _OutputIterator __result, _Compare __comp)
2639 {
2640 while (__first1 != __last1 && __first2 != __last2)
2641 {
2642 if (__comp(__first2, __first1))
2643 {
2644	      *__result = _GLIBCXX_MOVE(*__first2);
2645 ++__first2;
2646 }
2647 else
2648 {
2649	      *__result = _GLIBCXX_MOVE(*__first1);
2650 ++__first1;
2651 }
2652 ++__result;
2653 }
2654	  return _GLIBCXX_MOVE3(__first2, __last2,
2655				_GLIBCXX_MOVE3(__first1, __last1,
2656					       __result));
2657 }
2658
2659 template<typename _RandomAccessIterator1, typename _RandomAccessIterator2,
2660 typename _Distance, typename _Compare>
2661 void
2662 __merge_sort_loop(_RandomAccessIterator1 __first,
2663 _RandomAccessIterator1 __last,
2664 _RandomAccessIterator2 __result, _Distance __step_size,
2665 _Compare __comp)
2666 {
2667 const _Distance __two_step = 2 * __step_size;
2668
2669 while (__last - __first >= __two_step)
2670 {
2671 __result = std::__move_merge(__first, __first + __step_size,
2672 __first + __step_size,
2673 __first + __two_step,
2674 __result, __comp);
2675 __first += __two_step;
2676 }
2677 __step_size = std::min(_Distance(__last - __first), __step_size);
2678
2679 std::__move_merge(__first, __first + __step_size,
2680 __first + __step_size, __last, __result, __comp);
2681 }
2682
2683 template<typename _RandomAccessIterator, typename _Distance,
2684 typename _Compare>
2685 void
2686 __chunk_insertion_sort(_RandomAccessIterator __first,
2687 _RandomAccessIterator __last,
2688 _Distance __chunk_size, _Compare __comp)
2689 {
2690 while (__last - __first >= __chunk_size)
2691 {
2692 std::__insertion_sort(__first, __first + __chunk_size, __comp);
2693 __first += __chunk_size;
2694 }
2695 std::__insertion_sort(__first, __last, __comp);
2696 }
2697
2698 enum { _S_chunk_size = 7 };
2699
2700 template<typename _RandomAccessIterator, typename _Pointer, typename _Compare>
2701 void
2702 __merge_sort_with_buffer(_RandomAccessIterator __first,
2703 _RandomAccessIterator __last,
2704 _Pointer __buffer, _Compare __comp)
2705 {
2706 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
2707 _Distance;
2708
2709 const _Distance __len = __last - __first;
2710 const _Pointer __buffer_last = __buffer + __len;
2711
2712 _Distance __step_size = _S_chunk_size;
2713 std::__chunk_insertion_sort(__first, __last, __step_size, __comp);
2714
2715 while (__step_size < __len)
2716 {
2717 std::__merge_sort_loop(__first, __last, __buffer,
2718 __step_size, __comp);
2719 __step_size *= 2;
2720 std::__merge_sort_loop(__buffer, __buffer_last, __first,
2721 __step_size, __comp);
2722 __step_size *= 2;
2723 }
2724 }
2725
2726 template<typename _RandomAccessIterator, typename _Pointer,
2727 typename _Distance, typename _Compare>
2728 void
2729 __stable_sort_adaptive(_RandomAccessIterator __first,
2730 _RandomAccessIterator __last,
2731 _Pointer __buffer, _Distance __buffer_size,
2732 _Compare __comp)
2733 {
2734 const _Distance __len = (__last - __first + 1) / 2;
2735 const _RandomAccessIterator __middle = __first + __len;
2736 if (__len > __buffer_size)
2737 {
2738 std::__stable_sort_adaptive(__first, __middle, __buffer,
2739 __buffer_size, __comp);
2740 std::__stable_sort_adaptive(__middle, __last, __buffer,
2741 __buffer_size, __comp);
2742 }
2743 else
2744 {
2745 std::__merge_sort_with_buffer(__first, __middle, __buffer, __comp);
2746 std::__merge_sort_with_buffer(__middle, __last, __buffer, __comp);
2747 }
2748 std::__merge_adaptive(__first, __middle, __last,
2749 _Distance(__middle - __first),
2750 _Distance(__last - __middle),
2751 __buffer, __buffer_size,
2752 __comp);
2753 }
2754
2755 /// This is a helper function for the stable sorting routines.
2756 template<typename _RandomAccessIterator, typename _Compare>
2757 void
2758 __inplace_stable_sort(_RandomAccessIterator __first,
2759 _RandomAccessIterator __last, _Compare __comp)
2760 {
2761 if (__last - __first < 15)
2762 {
2763 std::__insertion_sort(__first, __last, __comp);
2764 return;
2765 }
2766 _RandomAccessIterator __middle = __first + (__last - __first) / 2;
2767 std::__inplace_stable_sort(__first, __middle, __comp);
2768 std::__inplace_stable_sort(__middle, __last, __comp);
2769 std::__merge_without_buffer(__first, __middle, __last,
2770 __middle - __first,
2771 __last - __middle,
2772 __comp);
2773 }
2774
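The buffered and in-place helpers above back the stable sorting routines (the public std::stable_sort is defined further on in this header). A minimal usage sketch with illustrative data, showing the stability guarantee the documentation relies on:

// Illustrative only.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

void stable_sort_sketch()
{
  std::vector<std::pair<int, std::string>> v{{2, "b"}, {1, "a"}, {2, "a"}};
  // Sort by the int key only; equivalent keys keep their original relative order.
  std::stable_sort(v.begin(), v.end(),
                   [](const std::pair<int, std::string>& x,
                      const std::pair<int, std::string>& y)
                   { return x.first < y.first; });
  // v is now {1,"a"}, {2,"b"}, {2,"a"} -- the two entries with key 2 kept their order.
}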
2775 // stable_sort
2776
2777 // Set algorithms: includes, set_union, set_intersection, set_difference,
2778 // set_symmetric_difference. All of these algorithms have the precondition
2779 // that their input ranges are sorted and the postcondition that their output
2780 // ranges are sorted.
2781
2782 template<typename _InputIterator1, typename _InputIterator2,
2783 typename _Compare>
2784 bool
2785 __includes(_InputIterator1 __first1, _InputIterator1 __last1,
2786 _InputIterator2 __first2, _InputIterator2 __last2,
2787 _Compare __comp)
2788 {
2789 while (__first1 != __last1 && __first2 != __last2)
2790 if (__comp(__first2, __first1))
2791 return false;
2792 else if (__comp(__first1, __first2))
2793 ++__first1;
2794 else
2795 {
2796 ++__first1;
2797 ++__first2;
2798 }
2799
2800 return __first2 == __last2;
2801 }
2802
2803 /**
2804   *  @brief Determines whether all elements of a sequence exist in a range.
2805 * @param __first1 Start of search range.
2806 * @param __last1 End of search range.
2807 * @param __first2 Start of sequence
2808 * @param __last2 End of sequence.
2809 * @return True if each element in [__first2,__last2) is contained in order
2810 * within [__first1,__last1). False otherwise.
2811 * @ingroup set_algorithms
2812 *
2813 * This operation expects both [__first1,__last1) and
2814 * [__first2,__last2) to be sorted. Searches for the presence of
2815 * each element in [__first2,__last2) within [__first1,__last1).
2816 * The iterators over each range only move forward, so this is a
2817 * linear algorithm. If an element in [__first2,__last2) is not
2818 * found before the search iterator reaches @p __last2, false is
2819 * returned.
2820 */
2821 template<typename _InputIterator1, typename _InputIterator2>
2822 inline bool
2823 includes(_InputIterator1 __first1, _InputIterator1 __last1,
2824 _InputIterator2 __first2, _InputIterator2 __last2)
2825 {
2826 // concept requirements
2827 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
2828 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
2829 __glibcxx_function_requires(_LessThanOpConcept<
2830 typename iterator_traits<_InputIterator1>::value_type,
2831 typename iterator_traits<_InputIterator2>::value_type>)
2832 __glibcxx_function_requires(_LessThanOpConcept<
2833 typename iterator_traits<_InputIterator2>::value_type,
2834 typename iterator_traits<_InputIterator1>::value_type>)
2835 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
2836 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
2837 __glibcxx_requires_irreflexive2(__first1, __last1);
2838 __glibcxx_requires_irreflexive2(__first2, __last2);
2839
2840 return std::__includes(__first1, __last1, __first2, __last2,
2841 __gnu_cxx::__ops::__iter_less_iter());
2842 }
2843
2844 /**
2845   *  @brief Determines whether all elements of a sequence exist in a range
2846 * using comparison.
2847 * @ingroup set_algorithms
2848 * @param __first1 Start of search range.
2849 * @param __last1 End of search range.
2850 * @param __first2 Start of sequence
2851 * @param __last2 End of sequence.
2852 * @param __comp Comparison function to use.
2853 * @return True if each element in [__first2,__last2) is contained
2854 * in order within [__first1,__last1) according to comp. False
2855 * otherwise. @ingroup set_algorithms
2856 *
2857 * This operation expects both [__first1,__last1) and
2858 * [__first2,__last2) to be sorted. Searches for the presence of
2859 * each element in [__first2,__last2) within [__first1,__last1),
2860 * using comp to decide. The iterators over each range only move
2861 * forward, so this is a linear algorithm. If an element in
2862 * [__first2,__last2) is not found before the search iterator
2863 * reaches @p __last2, false is returned.
2864 */
2865 template<typename _InputIterator1, typename _InputIterator2,
2866 typename _Compare>
2867 inline bool
2868 includes(_InputIterator1 __first1, _InputIterator1 __last1,
2869 _InputIterator2 __first2, _InputIterator2 __last2,
2870 _Compare __comp)
2871 {
2872 // concept requirements
2873 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
2874 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
2875 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2876 typename iterator_traits<_InputIterator1>::value_type,
2877 typename iterator_traits<_InputIterator2>::value_type>)
2878 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2879 typename iterator_traits<_InputIterator2>::value_type,
2880 typename iterator_traits<_InputIterator1>::value_type>)
2881 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
2882 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
2883 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
2884 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
2885
2886 return std::__includes(__first1, __last1, __first2, __last2,
2887 __gnu_cxx::__ops::__iter_comp_iter(__comp));
2888 }
2889
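A minimal usage sketch of std::includes as documented above (not part of the header; the container names are illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void includes_sketch()
{
  const std::vector<int> haystack{1, 2, 3, 4, 5};   // both ranges must be sorted
  const std::vector<int> needle{2, 4};
  assert(std::includes(haystack.begin(), haystack.end(),
                       needle.begin(), needle.end()));
}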
2890 // nth_element
2891 // merge
2892 // set_difference
2893 // set_intersection
2894 // set_union
2895 // stable_sort
2896 // set_symmetric_difference
2897 // min_element
2898 // max_element
2899
2900 template<typename _BidirectionalIterator, typename _Compare>
2901 bool
2902 __next_permutation(_BidirectionalIterator __first,
2903 _BidirectionalIterator __last, _Compare __comp)
2904 {
2905 if (__first == __last)
2906 return false;
2907 _BidirectionalIterator __i = __first;
2908 ++__i;
2909 if (__i == __last)
2910 return false;
2911 __i = __last;
2912 --__i;
2913
2914 for(;;)
2915 {
2916 _BidirectionalIterator __ii = __i;
2917 --__i;
2918 if (__comp(__i, __ii))
2919 {
2920 _BidirectionalIterator __j = __last;
2921 while (!__comp(__i, --__j))
2922 {}
2923 std::iter_swap(__i, __j);
2924 std::__reverse(__ii, __last,
2925 std::__iterator_category(__first));
2926 return true;
2927 }
2928 if (__i == __first)
2929 {
2930 std::__reverse(__first, __last,
2931 std::__iterator_category(__first));
2932 return false;
2933 }
2934 }
2935 }
2936
2937 /**
2938 * @brief Permute range into the next @e dictionary ordering.
2939 * @ingroup sorting_algorithms
2940 * @param __first Start of range.
2941 * @param __last End of range.
2942 * @return False if wrapped to first permutation, true otherwise.
2943 *
2944 * Treats all permutations of the range as a set of @e dictionary sorted
2945 * sequences. Permutes the current sequence into the next one of this set.
2946 * Returns true if there are more sequences to generate. If the sequence
2947 * is the largest of the set, the smallest is generated and false returned.
2948 */
2949 template<typename _BidirectionalIterator>
2950 inline bool
2951 next_permutation(_BidirectionalIterator __first,
2952 _BidirectionalIterator __last)
2953 {
2954 // concept requirements
2955 __glibcxx_function_requires(_BidirectionalIteratorConcept<
2956 _BidirectionalIterator>)
2957 __glibcxx_function_requires(_LessThanComparableConcept<
2958 typename iterator_traits<_BidirectionalIterator>::value_type>)
2959 __glibcxx_requires_valid_range(__first, __last);
2960 __glibcxx_requires_irreflexive(__first, __last);
2961
2962 return std::__next_permutation
2963 (__first, __last, __gnu_cxx::__ops::__iter_less_iter());
2964 }
2965
2966 /**
2967 * @brief Permute range into the next @e dictionary ordering using
2968 * comparison functor.
2969 * @ingroup sorting_algorithms
2970 * @param __first Start of range.
2971 * @param __last End of range.
2972 * @param __comp A comparison functor.
2973 * @return False if wrapped to first permutation, true otherwise.
2974 *
2975 * Treats all permutations of the range [__first,__last) as a set of
2976 * @e dictionary sorted sequences ordered by @p __comp. Permutes the current
2977 * sequence into the next one of this set. Returns true if there are more
2978 * sequences to generate. If the sequence is the largest of the set, the
2979 * smallest is generated and false returned.
2980 */
2981 template<typename _BidirectionalIterator, typename _Compare>
2982 inline bool
2983 next_permutation(_BidirectionalIterator __first,
2984 _BidirectionalIterator __last, _Compare __comp)
2985 {
2986 // concept requirements
2987 __glibcxx_function_requires(_BidirectionalIteratorConcept<
2988 _BidirectionalIterator>)
2989 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
2990 typename iterator_traits<_BidirectionalIterator>::value_type,
2991 typename iterator_traits<_BidirectionalIterator>::value_type>)
2992 __glibcxx_requires_valid_range(__first, __last);
2993 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
2994
2995 return std::__next_permutation
2996 (__first, __last, __gnu_cxx::__ops::__iter_comp_iter(__comp));
2997 }
2998
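A minimal usage sketch of std::next_permutation and its wrap-around behaviour described above (not part of the header; the sample vectors are illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void next_permutation_sketch()
{
  std::vector<int> v{1, 2, 3};
  assert(std::next_permutation(v.begin(), v.end()));    // v becomes {1, 3, 2}; more remain
  std::vector<int> w{3, 2, 1};                          // already the largest ordering
  assert(!std::next_permutation(w.begin(), w.end()));   // wraps to {1, 2, 3}, returns false
}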
2999 template<typename _BidirectionalIterator, typename _Compare>
3000 bool
3001 __prev_permutation(_BidirectionalIterator __first,
3002 _BidirectionalIterator __last, _Compare __comp)
3003 {
3004 if (__first == __last)
3005 return false;
3006 _BidirectionalIterator __i = __first;
3007 ++__i;
3008 if (__i == __last)
3009 return false;
3010 __i = __last;
3011 --__i;
3012
3013 for(;;)
3014 {
3015 _BidirectionalIterator __ii = __i;
3016 --__i;
3017 if (__comp(__ii, __i))
3018 {
3019 _BidirectionalIterator __j = __last;
3020 while (!__comp(--__j, __i))
3021 {}
3022 std::iter_swap(__i, __j);
3023 std::__reverse(__ii, __last,
3024 std::__iterator_category(__first));
3025 return true;
3026 }
3027 if (__i == __first)
3028 {
3029 std::__reverse(__first, __last,
3030 std::__iterator_category(__first));
3031 return false;
3032 }
3033 }
3034 }
3035
3036 /**
3037 * @brief Permute range into the previous @e dictionary ordering.
3038 * @ingroup sorting_algorithms
3039 * @param __first Start of range.
3040 * @param __last End of range.
3041 * @return False if wrapped to last permutation, true otherwise.
3042 *
3043 * Treats all permutations of the range as a set of @e dictionary sorted
3044 * sequences. Permutes the current sequence into the previous one of this
3045 * set. Returns true if there are more sequences to generate. If the
3046 * sequence is the smallest of the set, the largest is generated and false
3047 * returned.
3048 */
3049 template<typename _BidirectionalIterator>
3050 inline bool
3051 prev_permutation(_BidirectionalIterator __first,
3052 _BidirectionalIterator __last)
3053 {
3054 // concept requirements
3055 __glibcxx_function_requires(_BidirectionalIteratorConcept<
3056 _BidirectionalIterator>)
3057 __glibcxx_function_requires(_LessThanComparableConcept<
3058 typename iterator_traits<_BidirectionalIterator>::value_type>)
3059 __glibcxx_requires_valid_range(__first, __last);
3060 __glibcxx_requires_irreflexive(__first, __last);
3061
3062 return std::__prev_permutation(__first, __last,
3063 __gnu_cxx::__ops::__iter_less_iter());
3064 }
3065
3066 /**
3067 * @brief Permute range into the previous @e dictionary ordering using
3068 * comparison functor.
3069 * @ingroup sorting_algorithms
3070 * @param __first Start of range.
3071 * @param __last End of range.
3072 * @param __comp A comparison functor.
3073 * @return False if wrapped to last permutation, true otherwise.
3074 *
3075 * Treats all permutations of the range [__first,__last) as a set of
3076 * @e dictionary sorted sequences ordered by @p __comp. Permutes the current
3077 * sequence into the previous one of this set. Returns true if there are
3078 * more sequences to generate. If the sequence is the smallest of the set,
3079 * the largest is generated and false returned.
3080 */
3081 template<typename _BidirectionalIterator, typename _Compare>
3082 inline bool
3083 prev_permutation(_BidirectionalIterator __first,
3084 _BidirectionalIterator __last, _Compare __comp)
3085 {
3086 // concept requirements
3087 __glibcxx_function_requires(_BidirectionalIteratorConcept<
3088 _BidirectionalIterator>)
3089 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
3090 typename iterator_traits<_BidirectionalIterator>::value_type,
3091 typename iterator_traits<_BidirectionalIterator>::value_type>)
3092 __glibcxx_requires_valid_range(__first, __last);
3093 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
3094
3095 return std::__prev_permutation(__first, __last,
3096 __gnu_cxx::__ops::__iter_comp_iter(__comp));
3097 }
3098
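The mirror-image sketch for std::prev_permutation (not part of the header; data is illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void prev_permutation_sketch()
{
  std::vector<int> v{1, 3, 2};
  assert(std::prev_permutation(v.begin(), v.end()));    // v becomes {1, 2, 3}
  assert(!std::prev_permutation(v.begin(), v.end()));   // smallest already; wraps to {3, 2, 1}
}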
3099 // replace
3100 // replace_if
3101
3102 template<typename _InputIterator, typename _OutputIterator,
3103 typename _Predicate, typename _Tp>
3104 _OutputIterator
3105 __replace_copy_if(_InputIterator __first, _InputIterator __last,
3106 _OutputIterator __result,
3107 _Predicate __pred, const _Tp& __new_value)
3108 {
3109 for (; __first != __last; ++__first, (void)++__result)
3110 if (__pred(__first))
3111 *__result = __new_value;
3112 else
3113 *__result = *__first;
3114 return __result;
3115 }
3116
3117 /**
3118 * @brief Copy a sequence, replacing each element of one value with another
3119 * value.
3120 * @param __first An input iterator.
3121 * @param __last An input iterator.
3122 * @param __result An output iterator.
3123 * @param __old_value The value to be replaced.
3124 * @param __new_value The replacement value.
3125 * @return The end of the output sequence, @p result+(last-first).
3126 *
3127 * Copies each element in the input range @p [__first,__last) to the
3128 * output range @p [__result,__result+(__last-__first)) replacing elements
3129 * equal to @p __old_value with @p __new_value.
3130 */
3131 template<typename _InputIterator, typename _OutputIterator, typename _Tp>
3132 inline _OutputIterator
3133 replace_copy(_InputIterator __first, _InputIterator __last,
3134 _OutputIterator __result,
3135 const _Tp& __old_value, const _Tp& __new_value)
3136 {
3137 // concept requirements
3138 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3139 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
3140 typename iterator_traits<_InputIterator>::value_type>)
3141 __glibcxx_function_requires(_EqualOpConcept<
3142 typename iterator_traits<_InputIterator>::value_type, _Tp>)
3143 __glibcxx_requires_valid_range(__first, __last);
3144
3145 return std::__replace_copy_if(__first, __last, __result,
3146 __gnu_cxx::__ops::__iter_equals_val(__old_value),
3147 __new_value);
3148 }
3149
3150 /**
3151 * @brief Copy a sequence, replacing each value for which a predicate
3152 * returns true with another value.
3153 * @ingroup mutating_algorithms
3154 * @param __first An input iterator.
3155 * @param __last An input iterator.
3156 * @param __result An output iterator.
3157 * @param __pred A predicate.
3158 * @param __new_value The replacement value.
3159 * @return The end of the output sequence, @p __result+(__last-__first).
3160 *
3161 * Copies each element in the range @p [__first,__last) to the range
3162 * @p [__result,__result+(__last-__first)) replacing elements for which
3163 * @p __pred returns true with @p __new_value.
3164 */
3165 template<typename _InputIterator, typename _OutputIterator,
3166 typename _Predicate, typename _Tp>
3167 inline _OutputIterator
3168 replace_copy_if(_InputIterator __first, _InputIterator __last,
3169 _OutputIterator __result,
3170 _Predicate __pred, const _Tp& __new_value)
3171 {
3172 // concept requirements
3173 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3174 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
3175 typename iterator_traits<_InputIterator>::value_type>)
3176 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
3177 typename iterator_traits<_InputIterator>::value_type>)
3178 __glibcxx_requires_valid_range(__first, __last);
3179
3180 return std::__replace_copy_if(__first, __last, __result,
3181 __gnu_cxx::__ops::__pred_iter(__pred),
3182 __new_value);
3183 }
3184
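A minimal usage sketch of std::replace_copy and std::replace_copy_if as documented above (not part of the header; the vectors and the lambda are illustrative):

// Illustrative only.
#include <algorithm>
#include <iterator>
#include <vector>

void replace_copy_sketch()
{
  const std::vector<int> in{1, 0, 2, 0, 3};
  std::vector<int> out;
  // Copy, turning every 0 into -1; the input range is left untouched.
  std::replace_copy(in.begin(), in.end(), std::back_inserter(out), 0, -1);
  // Predicate form: turn every negative element into 0 while copying.
  std::vector<int> out2;
  std::replace_copy_if(out.begin(), out.end(), std::back_inserter(out2),
                       [](int x) { return x < 0; }, 0);
}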
3185 template<typename _InputIterator, typename _Predicate>
3186 typename iterator_traits<_InputIterator>::difference_type
3187 __count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred)
3188 {
3189 typename iterator_traits<_InputIterator>::difference_type __n = 0;
3190 for (; __first != __last; ++__first)
3191 if (__pred(__first))
3192 ++__n;
3193 return __n;
3194 }
3195
3196#if __cplusplus >= 201103L
3197 /**
3198 * @brief Determines whether the elements of a sequence are sorted.
3199 * @ingroup sorting_algorithms
3200 * @param __first An iterator.
3201 * @param __last Another iterator.
3202 * @return True if the elements are sorted, false otherwise.
3203 */
3204 template<typename _ForwardIterator>
3205 inline bool
3206 is_sorted(_ForwardIterator __first, _ForwardIterator __last)
3207 { return std::is_sorted_until(__first, __last) == __last; }
3208
3209 /**
3210 * @brief Determines whether the elements of a sequence are sorted
3211 * according to a comparison functor.
3212 * @ingroup sorting_algorithms
3213 * @param __first An iterator.
3214 * @param __last Another iterator.
3215 * @param __comp A comparison functor.
3216 * @return True if the elements are sorted, false otherwise.
3217 */
3218 template<typename _ForwardIterator, typename _Compare>
3219 inline bool
3220 is_sorted(_ForwardIterator __first, _ForwardIterator __last,
3221 _Compare __comp)
3222 { return std::is_sorted_until(__first, __last, __comp) == __last; }
3223
3224 template<typename _ForwardIterator, typename _Compare>
3225 _ForwardIterator
3226 __is_sorted_until(_ForwardIterator __first, _ForwardIterator __last,
3227 _Compare __comp)
3228 {
3229 if (__first == __last)
3230 return __last;
3231
3232 _ForwardIterator __next = __first;
3233 for (++__next; __next != __last; __first = __next, (void)++__next)
3234 if (__comp(__next, __first))
3235 return __next;
3236 return __next;
3237 }
3238
3239 /**
3240 * @brief Determines the end of a sorted sequence.
3241 * @ingroup sorting_algorithms
3242 * @param __first An iterator.
3243 * @param __last Another iterator.
3244 * @return An iterator pointing to the last iterator i in [__first, __last)
3245 * for which the range [__first, i) is sorted.
3246 */
3247 template<typename _ForwardIterator>
3248 inline _ForwardIterator
3249 is_sorted_until(_ForwardIterator __first, _ForwardIterator __last)
3250 {
3251 // concept requirements
3252 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3253 __glibcxx_function_requires(_LessThanComparableConcept<
3254 typename iterator_traits<_ForwardIterator>::value_type>)
3255 __glibcxx_requires_valid_range(__first, __last);
3256 __glibcxx_requires_irreflexive(__first, __last);
3257
3258 return std::__is_sorted_until(__first, __last,
3259 __gnu_cxx::__ops::__iter_less_iter());
3260 }
3261
3262 /**
3263 * @brief Determines the end of a sorted sequence using comparison functor.
3264 * @ingroup sorting_algorithms
3265 * @param __first An iterator.
3266 * @param __last Another iterator.
3267 * @param __comp A comparison functor.
3268 * @return An iterator pointing to the last iterator i in [__first, __last)
3269 * for which the range [__first, i) is sorted.
3270 */
3271 template<typename _ForwardIterator, typename _Compare>
3272 inline _ForwardIterator
3273 is_sorted_until(_ForwardIterator __first, _ForwardIterator __last,
3274 _Compare __comp)
3275 {
3276 // concept requirements
3277 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3278 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
3279 typename iterator_traits<_ForwardIterator>::value_type,
3280 typename iterator_traits<_ForwardIterator>::value_type>)
3281 __glibcxx_requires_valid_range(__first, __last);
3282 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
3283
3284 return std::__is_sorted_until(__first, __last,
3285 __gnu_cxx::__ops::__iter_comp_iter(__comp));
3286 }
3287
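A minimal usage sketch of std::is_sorted and std::is_sorted_until as documented above (not part of the header; the sample vector is illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void is_sorted_sketch()
{
  const std::vector<int> v{1, 2, 4, 3};
  assert(!std::is_sorted(v.begin(), v.end()));
  // is_sorted_until returns the end of the longest sorted prefix (here, the 3).
  auto it = std::is_sorted_until(v.begin(), v.end());
  assert(it == v.begin() + 3);
}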
3288 /**
3289 * @brief Determines min and max at once as an ordered pair.
3290 * @ingroup sorting_algorithms
3291 * @param __a A thing of arbitrary type.
3292 * @param __b Another thing of arbitrary type.
3293 * @return A pair(__b, __a) if __b is smaller than __a, pair(__a,
3294 * __b) otherwise.
3295 */
3296 template<typename _Tp>
3297    _GLIBCXX14_CONSTEXPR
3298 inline pair<const _Tp&, const _Tp&>
3299 minmax(const _Tp& __a, const _Tp& __b)
3300 {
3301 // concept requirements
3302 __glibcxx_function_requires(_LessThanComparableConcept<_Tp>)
3303
3304 return __b < __a ? pair<const _Tp&, const _Tp&>(__b, __a)
3305 : pair<const _Tp&, const _Tp&>(__a, __b);
3306 }
3307
3308 /**
3309 * @brief Determines min and max at once as an ordered pair.
3310 * @ingroup sorting_algorithms
3311 * @param __a A thing of arbitrary type.
3312 * @param __b Another thing of arbitrary type.
3313 * @param __comp A @link comparison_functors comparison functor @endlink.
3314 * @return A pair(__b, __a) if __b is smaller than __a, pair(__a,
3315 * __b) otherwise.
3316 */
3317 template<typename _Tp, typename _Compare>
3318    _GLIBCXX14_CONSTEXPR
3319 inline pair<const _Tp&, const _Tp&>
3320 minmax(const _Tp& __a, const _Tp& __b, _Compare __comp)
3321 {
3322 return __comp(__b, __a) ? pair<const _Tp&, const _Tp&>(__b, __a)
3323 : pair<const _Tp&, const _Tp&>(__a, __b);
3324 }
3325
3326 template<typename _ForwardIterator, typename _Compare>
3327    _GLIBCXX14_CONSTEXPR
3328 pair<_ForwardIterator, _ForwardIterator>
3329 __minmax_element(_ForwardIterator __first, _ForwardIterator __last,
3330 _Compare __comp)
3331 {
3332 _ForwardIterator __next = __first;
3333 if (__first == __last
3334 || ++__next == __last)
3335 return std::make_pair(__first, __first);
3336
3337 _ForwardIterator __min{}, __max{};
3338 if (__comp(__next, __first))
3339 {
3340 __min = __next;
3341 __max = __first;
3342 }
3343 else
3344 {
3345 __min = __first;
3346 __max = __next;
3347 }
3348
3349 __first = __next;
3350 ++__first;
3351
3352 while (__first != __last)
3353 {
3354 __next = __first;
3355 if (++__next == __last)
3356 {
3357 if (__comp(__first, __min))
3358 __min = __first;
3359 else if (!__comp(__first, __max))
3360 __max = __first;
3361 break;
3362 }
3363
3364 if (__comp(__next, __first))
3365 {
3366 if (__comp(__next, __min))
3367 __min = __next;
3368 if (!__comp(__first, __max))
3369 __max = __first;
3370 }
3371 else
3372 {
3373 if (__comp(__first, __min))
3374 __min = __first;
3375 if (!__comp(__next, __max))
3376 __max = __next;
3377 }
3378
3379 __first = __next;
3380 ++__first;
3381 }
3382
3383 return std::make_pair(__min, __max);
3384 }
3385
3386 /**
3387 * @brief Return a pair of iterators pointing to the minimum and maximum
3388 * elements in a range.
3389 * @ingroup sorting_algorithms
3390 * @param __first Start of range.
3391 * @param __last End of range.
3392 * @return make_pair(m, M), where m is the first iterator i in
3393 * [__first, __last) such that no other element in the range is
3394 * smaller, and where M is the last iterator i in [__first, __last)
3395 * such that no other element in the range is larger.
3396 */
3397 template<typename _ForwardIterator>
3398    _GLIBCXX14_CONSTEXPR
3399 inline pair<_ForwardIterator, _ForwardIterator>
3400 minmax_element(_ForwardIterator __first, _ForwardIterator __last)
3401 {
3402 // concept requirements
3403 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3404 __glibcxx_function_requires(_LessThanComparableConcept<
3405 typename iterator_traits<_ForwardIterator>::value_type>)
3406 __glibcxx_requires_valid_range(__first, __last);
3407 __glibcxx_requires_irreflexive(__first, __last);
3408
3409 return std::__minmax_element(__first, __last,
3410 __gnu_cxx::__ops::__iter_less_iter());
3411 }
3412
3413 /**
3414 * @brief Return a pair of iterators pointing to the minimum and maximum
3415 * elements in a range.
3416 * @ingroup sorting_algorithms
3417 * @param __first Start of range.
3418 * @param __last End of range.
3419 * @param __comp Comparison functor.
3420 * @return make_pair(m, M), where m is the first iterator i in
3421 * [__first, __last) such that no other element in the range is
3422 * smaller, and where M is the last iterator i in [__first, __last)
3423 * such that no other element in the range is larger.
3424 */
3425 template<typename _ForwardIterator, typename _Compare>
3426    _GLIBCXX14_CONSTEXPR
3427 inline pair<_ForwardIterator, _ForwardIterator>
3428 minmax_element(_ForwardIterator __first, _ForwardIterator __last,
3429 _Compare __comp)
3430 {
3431 // concept requirements
3432 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3433 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
3434 typename iterator_traits<_ForwardIterator>::value_type,
3435 typename iterator_traits<_ForwardIterator>::value_type>)
3436 __glibcxx_requires_valid_range(__first, __last);
3437 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
3438
3439 return std::__minmax_element(__first, __last,
3440 __gnu_cxx::__ops::__iter_comp_iter(__comp));
3441 }
3442
3443 // N2722 + DR 915.
3444 template<typename _Tp>
3445    _GLIBCXX14_CONSTEXPR
3446 inline _Tp
3447 min(initializer_list<_Tp> __l)
3448 { return *std::min_element(__l.begin(), __l.end()); }
3449
3450 template<typename _Tp, typename _Compare>
3451    _GLIBCXX14_CONSTEXPR
3452 inline _Tp
3453 min(initializer_list<_Tp> __l, _Compare __comp)
3454 { return *std::min_element(__l.begin(), __l.end(), __comp); }
3455
3456 template<typename _Tp>
3457    _GLIBCXX14_CONSTEXPR
3458 inline _Tp
3459 max(initializer_list<_Tp> __l)
3460 { return *std::max_element(__l.begin(), __l.end()); }
3461
3462 template<typename _Tp, typename _Compare>
3463    _GLIBCXX14_CONSTEXPR
3464 inline _Tp
3465 max(initializer_list<_Tp> __l, _Compare __comp)
3466 { return *std::max_element(__l.begin(), __l.end(), __comp); }
3467
3468 template<typename _Tp>
3469    _GLIBCXX14_CONSTEXPR
3470 inline pair<_Tp, _Tp>
3471 minmax(initializer_list<_Tp> __l)
3472 {
3473 pair<const _Tp*, const _Tp*> __p =
3474 std::minmax_element(__l.begin(), __l.end());
3475 return std::make_pair(*__p.first, *__p.second);
3476 }
3477
3478 template<typename _Tp, typename _Compare>
3479    _GLIBCXX14_CONSTEXPR
3480 inline pair<_Tp, _Tp>
3481 minmax(initializer_list<_Tp> __l, _Compare __comp)
3482 {
3483 pair<const _Tp*, const _Tp*> __p =
3484 std::minmax_element(__l.begin(), __l.end(), __comp);
3485 return std::make_pair(*__p.first, *__p.second);
3486 }
3487
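A minimal usage sketch of the minmax overloads and std::minmax_element documented above (not part of the header; the sample data is illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void minmax_sketch()
{
  auto p = std::minmax({3, 1, 4, 1, 5});             // initializer_list overload
  assert(p.first == 1 && p.second == 5);

  const std::vector<int> v{3, 1, 4, 1, 5};
  auto it = std::minmax_element(v.begin(), v.end());
  // First (leftmost) minimum and last (rightmost) maximum, per the documentation above.
  assert(it.first == v.begin() + 1 && it.second == v.begin() + 4);
}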
3488 template<typename _ForwardIterator1, typename _ForwardIterator2,
3489 typename _BinaryPredicate>
3490 bool
3491 __is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3492 _ForwardIterator2 __first2, _BinaryPredicate __pred)
3493 {
3494 // Efficiently compare identical prefixes: O(N) if sequences
3495 // have the same elements in the same order.
3496 for (; __first1 != __last1; ++__first1, (void)++__first2)
3497 if (!__pred(__first1, __first2))
3498 break;
3499
3500 if (__first1 == __last1)
3501 return true;
3502
3503 // Establish __last2 assuming equal ranges by iterating over the
3504 // rest of the list.
3505 _ForwardIterator2 __last2 = __first2;
3506 std::advance(__last2, std::distance(__first1, __last1));
3507 for (_ForwardIterator1 __scan = __first1; __scan != __last1; ++__scan)
3508 {
3509 if (__scan != std::__find_if(__first1, __scan,
3510 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan)))
3511 continue; // We've seen this one before.
3512
3513 auto __matches
3514 = std::__count_if(__first2, __last2,
3515 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan));
3516 if (0 == __matches ||
3517 std::__count_if(__scan, __last1,
3518 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan))
3519 != __matches)
3520 return false;
3521 }
3522 return true;
3523 }
3524
3525 /**
3526 * @brief Checks whether a permutation of the second sequence is equal
3527 * to the first sequence.
3528 * @ingroup non_mutating_algorithms
3529 * @param __first1 Start of first range.
3530 * @param __last1 End of first range.
3531 * @param __first2 Start of second range.
3532 * @return true if there exists a permutation of the elements in the range
3533 * [__first2, __first2 + (__last1 - __first1)), beginning with
3534 * ForwardIterator2 begin, such that equal(__first1, __last1, begin)
3535 * returns true; otherwise, returns false.
3536 */
3537 template<typename _ForwardIterator1, typename _ForwardIterator2>
3538 inline bool
3539 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3540 _ForwardIterator2 __first2)
3541 {
3542 // concept requirements
3543 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
3544 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
3545 __glibcxx_function_requires(_EqualOpConcept<
3546 typename iterator_traits<_ForwardIterator1>::value_type,
3547 typename iterator_traits<_ForwardIterator2>::value_type>)
3548 __glibcxx_requires_valid_range(__first1, __last1);
3549
3550 return std::__is_permutation(__first1, __last1, __first2,
3551 __gnu_cxx::__ops::__iter_equal_to_iter());
3552 }
3553
3554 /**
3555 * @brief Checks whether a permutation of the second sequence is equal
3556 * to the first sequence.
3557 * @ingroup non_mutating_algorithms
3558 * @param __first1 Start of first range.
3559 * @param __last1 End of first range.
3560 * @param __first2 Start of second range.
3561 * @param __pred A binary predicate.
3562 * @return true if there exists a permutation of the elements in
3563 * the range [__first2, __first2 + (__last1 - __first1)),
3564 * beginning with ForwardIterator2 begin, such that
3565 * equal(__first1, __last1, __begin, __pred) returns true;
3566 * otherwise, returns false.
3567 */
3568 template<typename _ForwardIterator1, typename _ForwardIterator2,
3569 typename _BinaryPredicate>
3570 inline bool
3571 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3572 _ForwardIterator2 __first2, _BinaryPredicate __pred)
3573 {
3574 // concept requirements
3575 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
3576 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
3577 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
3578 typename iterator_traits<_ForwardIterator1>::value_type,
3579 typename iterator_traits<_ForwardIterator2>::value_type>)
3580 __glibcxx_requires_valid_range(__first1, __last1);
3581
3582 return std::__is_permutation(__first1, __last1, __first2,
3583 __gnu_cxx::__ops::__iter_comp_iter(__pred));
3584 }
3585
3586#if __cplusplus > 201103L
3587 template<typename _ForwardIterator1, typename _ForwardIterator2,
3588 typename _BinaryPredicate>
3589 bool
3590 __is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3591 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
3592 _BinaryPredicate __pred)
3593 {
3594 using _Cat1
3595 = typename iterator_traits<_ForwardIterator1>::iterator_category;
3596 using _Cat2
3597 = typename iterator_traits<_ForwardIterator2>::iterator_category;
3598 using _It1_is_RA = is_same<_Cat1, random_access_iterator_tag>;
3599 using _It2_is_RA = is_same<_Cat2, random_access_iterator_tag>;
3600 constexpr bool __ra_iters = _It1_is_RA() && _It2_is_RA();
3601 if (__ra_iters)
3602 {
3603 auto __d1 = std::distance(__first1, __last1);
3604 auto __d2 = std::distance(__first2, __last2);
3605 if (__d1 != __d2)
3606 return false;
3607 }
3608
3609 // Efficiently compare identical prefixes: O(N) if sequences
3610 // have the same elements in the same order.
3611 for (; __first1 != __last1 && __first2 != __last2;
3612 ++__first1, (void)++__first2)
3613 if (!__pred(__first1, __first2))
3614 break;
3615
3616 if (__ra_iters)
3617 {
3618 if (__first1 == __last1)
3619 return true;
3620 }
3621 else
3622 {
3623 auto __d1 = std::distance(__first1, __last1);
3624 auto __d2 = std::distance(__first2, __last2);
3625 if (__d1 == 0 && __d2 == 0)
3626 return true;
3627 if (__d1 != __d2)
3628 return false;
3629 }
3630
3631 for (_ForwardIterator1 __scan = __first1; __scan != __last1; ++__scan)
3632 {
3633 if (__scan != std::__find_if(__first1, __scan,
3634 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan)))
3635 continue; // We've seen this one before.
3636
3637 auto __matches = std::__count_if(__first2, __last2,
3638 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan));
3639 if (0 == __matches
3640 || std::__count_if(__scan, __last1,
3641 __gnu_cxx::__ops::__iter_comp_iter(__pred, __scan))
3642 != __matches)
3643 return false;
3644 }
3645 return true;
3646 }
3647
3648 /**
3649   *  @brief  Checks whether a permutation of the second sequence is equal
3650 * to the first sequence.
3651 * @ingroup non_mutating_algorithms
3652 * @param __first1 Start of first range.
3653 * @param __last1 End of first range.
3654 * @param __first2 Start of second range.
3655   *  @param  __last2   End of second range.
3656 * @return true if there exists a permutation of the elements in the range
3657 * [__first2, __last2), beginning with ForwardIterator2 begin,
3658 * such that equal(__first1, __last1, begin) returns true;
3659 * otherwise, returns false.
3660 */
3661 template<typename _ForwardIterator1, typename _ForwardIterator2>
3662 inline bool
3663 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3664 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
3665 {
3666 __glibcxx_requires_valid_range(__first1, __last1);
3667 __glibcxx_requires_valid_range(__first2, __last2);
3668
3669 return
3670 std::__is_permutation(__first1, __last1, __first2, __last2,
3671 __gnu_cxx::__ops::__iter_equal_to_iter());
3672 }
3673
3674 /**
3675 * @brief Checks whether a permutation of the second sequence is equal
3676 * to the first sequence.
3677 * @ingroup non_mutating_algorithms
3678 * @param __first1 Start of first range.
3679 * @param __last1 End of first range.
3680 * @param __first2 Start of second range.
3681   *  @param  __last2   End of second range.
3682 * @param __pred A binary predicate.
3683 * @return true if there exists a permutation of the elements in the range
3684 * [__first2, __last2), beginning with ForwardIterator2 begin,
3685 * such that equal(__first1, __last1, __begin, __pred) returns true;
3686 * otherwise, returns false.
3687 */
3688 template<typename _ForwardIterator1, typename _ForwardIterator2,
3689 typename _BinaryPredicate>
3690 inline bool
3691 is_permutation(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
3692 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
3693 _BinaryPredicate __pred)
3694 {
3695 __glibcxx_requires_valid_range(__first1, __last1);
3696 __glibcxx_requires_valid_range(__first2, __last2);
3697
3698 return std::__is_permutation(__first1, __last1, __first2, __last2,
3699 __gnu_cxx::__ops::__iter_comp_iter(__pred));
3700 }
3701#endif
3702
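A minimal usage sketch of std::is_permutation covering both the three-iterator (C++11) and four-iterator (C++14) forms documented above (not part of the header; the sample vectors are illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void is_permutation_sketch()
{
  const std::vector<int> a{1, 2, 3, 2};
  const std::vector<int> b{2, 2, 1, 3};
  // Three-iterator form assumes the second range has the same length as the first.
  assert(std::is_permutation(a.begin(), a.end(), b.begin()));
  // Four-iterator form also verifies that the lengths match.
  assert(std::is_permutation(a.begin(), a.end(), b.begin(), b.end()));
}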
3703#ifdef _GLIBCXX_USE_C99_STDINT_TR1
3704 /**
3705 * @brief Shuffle the elements of a sequence using a uniform random
3706 * number generator.
3707 * @ingroup mutating_algorithms
3708 * @param __first A forward iterator.
3709 * @param __last A forward iterator.
3710 * @param __g A UniformRandomNumberGenerator (26.5.1.3).
3711 * @return Nothing.
3712 *
3713 * Reorders the elements in the range @p [__first,__last) using @p __g to
3714 * provide random numbers.
3715 */
3716 template<typename _RandomAccessIterator,
3717 typename _UniformRandomNumberGenerator>
3718 void
3719 shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last,
3720 _UniformRandomNumberGenerator&& __g)
3721 {
3722 // concept requirements
3723 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
3724 _RandomAccessIterator>)
3725 __glibcxx_requires_valid_range(__first, __last);
3726
3727 if (__first == __last)
3728 return;
3729
3730 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
3731 _DistanceType;
3732
3733 typedef typename std::make_unsigned<_DistanceType>::type __ud_type;
3734 typedef typename std::uniform_int_distribution<__ud_type> __distr_type;
3735 typedef typename __distr_type::param_type __p_type;
3736 __distr_type __d;
3737
3738 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
3739 std::iter_swap(__i, __first + __d(__g, __p_type(0, __i - __first)));
3740 }
3741#endif
3742
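A minimal usage sketch of std::shuffle with a standard random engine (not part of the header; the engine choice and data are illustrative):

// Illustrative only.
#include <algorithm>
#include <random>
#include <vector>

void shuffle_sketch()
{
  std::vector<int> v{1, 2, 3, 4, 5};
  std::mt19937 gen(std::random_device{}());   // any uniform random number generator
  std::shuffle(v.begin(), v.end(), gen);      // uniformly reorders v in place
}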
3743#endif // C++11
3744
3745_GLIBCXX_END_NAMESPACE_VERSION
3746
3747_GLIBCXX_BEGIN_NAMESPACE_ALGO
3748
3749 /**
3750 * @brief Apply a function to every element of a sequence.
3751 * @ingroup non_mutating_algorithms
3752 * @param __first An input iterator.
3753 * @param __last An input iterator.
3754 * @param __f A unary function object.
3755 * @return @p __f (std::move(@p __f) in C++0x).
3756 *
3757 * Applies the function object @p __f to each element in the range
3758 * @p [first,last). @p __f must not modify the order of the sequence.
3759 * If @p __f has a return value it is ignored.
3760 */
3761 template<typename _InputIterator, typename _Function>
3762 _Function
3763 for_each(_InputIterator __first, _InputIterator __last, _Function __f)
3764 {
3765 // concept requirements
3766 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3767 __glibcxx_requires_valid_range(__first, __last);
3768 for (; __first != __last; ++__first)
3769 __f(*__first);
3770      return _GLIBCXX_MOVE(__f);
3771 }
3772
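A minimal usage sketch of std::for_each, including the returned function object noted above (not part of the header; the lambda and names are illustrative):

// Illustrative only.
#include <algorithm>
#include <cassert>
#include <vector>

void for_each_sketch()
{
  const std::vector<int> v{1, 2, 3};
  int sum = 0;
  // The function object is applied to every element and returned by value.
  auto f = std::for_each(v.begin(), v.end(), [&sum](int x) { sum += x; });
  (void)f;
  assert(sum == 6);
}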
3773 /**
3774 * @brief Find the first occurrence of a value in a sequence.
3775 * @ingroup non_mutating_algorithms
3776 * @param __first An input iterator.
3777 * @param __last An input iterator.
3778 * @param __val The value to find.
3779 * @return The first iterator @c i in the range @p [__first,__last)
3780 * such that @c *i == @p __val, or @p __last if no such iterator exists.
3781 */
3782 template<typename _InputIterator, typename _Tp>
3783 inline _InputIterator
3784 find(_InputIterator __first, _InputIterator __last,
3785 const _Tp& __val)
3786 {
3787 // concept requirements
3788 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3789 __glibcxx_function_requires(_EqualOpConcept<
3790 typename iterator_traits<_InputIterator>::value_type, _Tp>)
3791 __glibcxx_requires_valid_range(__first, __last);
3792 return std::__find_if(__first, __last,
3793 __gnu_cxx::__ops::__iter_equals_val(__val));
3794 }
3795
3796 /**
3797 * @brief Find the first element in a sequence for which a
3798 * predicate is true.
3799 * @ingroup non_mutating_algorithms
3800 * @param __first An input iterator.
3801 * @param __last An input iterator.
3802 * @param __pred A predicate.
3803 * @return The first iterator @c i in the range @p [__first,__last)
3804 * such that @p __pred(*i) is true, or @p __last if no such iterator exists.
3805 */
3806 template<typename _InputIterator, typename _Predicate>
3807 inline _InputIterator
3808 find_if(_InputIterator __first, _InputIterator __last,
3809 _Predicate __pred)
3810 {
3811 // concept requirements
3812 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3813 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
3814 typename iterator_traits<_InputIterator>::value_type>)
3815 __glibcxx_requires_valid_range(__first, __last);
3816
3817 return std::__find_if(__first, __last,
3818 __gnu_cxx::__ops::__pred_iter(__pred));
3819 }
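 // [Editorial usage sketch -- not part of this header.] Hedged examples of
 // std::find and std::find_if; assumes <algorithm> and <vector>.
#if 0
 void example_find()
 {
 std::vector<int> v{3, 1, 4, 1, 5};
 auto it1 = std::find(v.begin(), v.end(), 4); // -> the 4
 auto it2 = std::find_if(v.begin(), v.end(),
 [](int x) { return x > 3; }); // -> first value > 3
 (void)it1; (void)it2; // both are v.end() when no match exists
 }
#endif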
3820
3821 /**
3822 * @brief Find element from a set in a sequence.
3823 * @ingroup non_mutating_algorithms
3824 * @param __first1 Start of range to search.
3825 * @param __last1 End of range to search.
3826 * @param __first2 Start of match candidates.
3827 * @param __last2 End of match candidates.
3828 * @return The first iterator @c i in the range
3829 * @p [__first1,__last1) such that @c *i == @c *i2 for some iterator
3830 * @c i2 in @p [__first2,__last2), or @p __last1 if no such iterator exists.
3831 *
3832 * Searches the range @p [__first1,__last1) for an element that is
3833 * equal to some element in the range [__first2,__last2). If
3834 * found, returns an iterator in the range [__first1,__last1),
3835 * otherwise returns @p __last1.
3836 */
3837 template<typename _InputIterator, typename _ForwardIterator>
3838 _InputIterator
3839 find_first_of(_InputIterator __first1, _InputIterator __last1,
3840 _ForwardIterator __first2, _ForwardIterator __last2)
3841 {
3842 // concept requirements
3843 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3844 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3845 __glibcxx_function_requires(_EqualOpConcept<
3846 typename iterator_traits<_InputIterator>::value_type,
3847 typename iterator_traits<_ForwardIterator>::value_type>)
3848 __glibcxx_requires_valid_range(__first1, __last1);
3849 __glibcxx_requires_valid_range(__first2, __last2);
3850
3851 for (; __first1 != __last1; ++__first1)
3852 for (_ForwardIterator __iter = __first2; __iter != __last2; ++__iter)
3853 if (*__first1 == *__iter)
3854 return __first1;
3855 return __last1;
3856 }
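 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::find_first_of; assumes <algorithm> and <string>.
#if 0
 void example_find_first_of()
 {
 std::string text = "rhythm and blues";
 std::string vowels = "aeiou";
 // Returns an iterator to the first vowel in `text` ('a'), or text.end().
 auto it = std::find_first_of(text.begin(), text.end(),
 vowels.begin(), vowels.end());
 (void)it;
 }
#endif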
3857
3858 /**
3859 * @brief Find element from a set in a sequence using a predicate.
3860 * @ingroup non_mutating_algorithms
3861 * @param __first1 Start of range to search.
3862 * @param __last1 End of range to search.
3863 * @param __first2 Start of match candidates.
3864 * @param __last2 End of match candidates.
3865 * @param __comp Predicate to use.
3866 * @return The first iterator @c i in the range
3867 * @p [__first1,__last1) such that @p __comp(*i,*i2) is true for some
3868 * iterator @c i2 in @p [__first2,__last2), or @p __last1 if no
3869 * such iterator exists.
3870 *
3871 *
3872 * Searches the range @p [__first1,__last1) for an element that
3873 * matches some element in the range [__first2,__last2) according
3874 * to @p __comp. If found, returns an iterator in the range
3875 * [__first1,__last1), otherwise returns @p __last1.
3876 */
3877 template<typename _InputIterator, typename _ForwardIterator,
3878 typename _BinaryPredicate>
3879 _InputIterator
3880 find_first_of(_InputIterator __first1, _InputIterator __last1,
3881 _ForwardIterator __first2, _ForwardIterator __last2,
3882 _BinaryPredicate __comp)
3883 {
3884 // concept requirements
3885 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3886 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3887 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
3888 typename iterator_traits<_InputIterator>::value_type,
3889 typename iterator_traits<_ForwardIterator>::value_type>)
3890 __glibcxx_requires_valid_range(__first1, __last1);
3891 __glibcxx_requires_valid_range(__first2, __last2);
3892
3893 for (; __first1 != __last1; ++__first1)
3894 for (_ForwardIterator __iter = __first2; __iter != __last2; ++__iter)
3895 if (__comp(*__first1, *__iter))
3896 return __first1;
3897 return __last1;
3898 }
3899
3900 /**
3901 * @brief Find two adjacent values in a sequence that are equal.
3902 * @ingroup non_mutating_algorithms
3903 * @param __first A forward iterator.
3904 * @param __last A forward iterator.
3905 * @return The first iterator @c i such that @c i and @c i+1 are both
3906 * valid iterators in @p [__first,__last) and such that @c *i == @c *(i+1),
3907 * or @p __last if no such iterator exists.
3908 */
3909 template<typename _ForwardIterator>
3910 inline _ForwardIterator
3911 adjacent_find(_ForwardIterator __first, _ForwardIterator __last)
3912 {
3913 // concept requirements
3914 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3915 __glibcxx_function_requires(_EqualityComparableConcept<
3916 typename iterator_traits<_ForwardIterator>::value_type>)
3917 __glibcxx_requires_valid_range(__first, __last);
3918
3919 return std::__adjacent_find(__first, __last,
3920 __gnu_cxx::__ops::__iter_equal_to_iter());
3921 }
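 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::adjacent_find; assumes <algorithm> and <vector>.
#if 0
 void example_adjacent_find()
 {
 std::vector<int> v{1, 2, 2, 3};
 // Returns an iterator to the first of the two equal 2s, or v.end().
 auto it = std::adjacent_find(v.begin(), v.end());
 (void)it;
 }
#endif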
3922
3923 /**
3924 * @brief Find two adjacent values in a sequence using a predicate.
3925 * @ingroup non_mutating_algorithms
3926 * @param __first A forward iterator.
3927 * @param __last A forward iterator.
3928 * @param __binary_pred A binary predicate.
3929 * @return The first iterator @c i such that @c i and @c i+1 are both
3930 * valid iterators in @p [__first,__last) and such that
3931 * @p __binary_pred(*i,*(i+1)) is true, or @p __last if no such iterator
3932 * exists.
3933 */
3934 template<typename _ForwardIterator, typename _BinaryPredicate>
3935 inline _ForwardIterator
3936 adjacent_find(_ForwardIterator __first, _ForwardIterator __last,
3937 _BinaryPredicate __binary_pred)
3938 {
3939 // concept requirements
3940 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
3941 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
3942 typename iterator_traits<_ForwardIterator>::value_type,
3943 typename iterator_traits<_ForwardIterator>::value_type>)
3944 __glibcxx_requires_valid_range(__first, __last);
3945
3946 return std::__adjacent_find(__first, __last,
3947 __gnu_cxx::__ops::__iter_comp_iter(__binary_pred));
3948 }
3949
3950 /**
3951 * @brief Count the number of copies of a value in a sequence.
3952 * @ingroup non_mutating_algorithms
3953 * @param __first An input iterator.
3954 * @param __last An input iterator.
3955 * @param __value The value to be counted.
3956 * @return The number of iterators @c i in the range @p [__first,__last)
3957 * for which @c *i == @p __value
3958 */
3959 template<typename _InputIterator, typename _Tp>
3960 inline typename iterator_traits<_InputIterator>::difference_type
3961 count(_InputIterator __first, _InputIterator __last, const _Tp& __value)
3962 {
3963 // concept requirements
3964 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3965 __glibcxx_function_requires(_EqualOpConcept<
3966 typename iterator_traits<_InputIterator>::value_type, _Tp>)
3967 __glibcxx_requires_valid_range(__first, __last);
3968
3969 return std::__count_if(__first, __last,
3970 __gnu_cxx::__ops::__iter_equals_val(__value));
3971 }
3972
3973 /**
3974 * @brief Count the elements of a sequence for which a predicate is true.
3975 * @ingroup non_mutating_algorithms
3976 * @param __first An input iterator.
3977 * @param __last An input iterator.
3978 * @param __pred A predicate.
3979 * @return The number of iterators @c i in the range @p [__first,__last)
3980 * for which @p __pred(*i) is true.
3981 */
3982 template<typename _InputIterator, typename _Predicate>
3983 inline typename iterator_traits<_InputIterator>::difference_type
3984 count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred)
3985 {
3986 // concept requirements
3987 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
3988 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
3989 typename iterator_traits<_InputIterator>::value_type>)
3990 __glibcxx_requires_valid_range(__first, __last);
3991
3992 return std::__count_if(__first, __last,
3993 __gnu_cxx::__ops::__pred_iter(__pred));
3994 }
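 // [Editorial usage sketch -- not part of this header.] Hedged examples of
 // std::count and std::count_if; assumes <algorithm> and <vector>.
#if 0
 void example_count()
 {
 std::vector<int> v{1, 2, 2, 3, 2};
 auto twos = std::count(v.begin(), v.end(), 2); // 3
 auto odds = std::count_if(v.begin(), v.end(),
 [](int x) { return x % 2 != 0; }); // 2
 (void)twos; (void)odds;
 }
#endif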
3995
3996 /**
3997 * @brief Search a sequence for a matching sub-sequence.
3998 * @ingroup non_mutating_algorithms
3999 * @param __first1 A forward iterator.
4000 * @param __last1 A forward iterator.
4001 * @param __first2 A forward iterator.
4002 * @param __last2 A forward iterator.
4003 * @return The first iterator @c i in the range @p
4004 * [__first1,__last1-(__last2-__first2)) such that @c *(i+N) == @p
4005 * *(__first2+N) for each @c N in the range @p
4006 * [0,__last2-__first2), or @p __last1 if no such iterator exists.
4007 *
4008 * Searches the range @p [__first1,__last1) for a sub-sequence that
4009 * compares equal value-by-value with the sequence given by @p
4010 * [__first2,__last2) and returns an iterator to the first element
4011 * of the sub-sequence, or @p __last1 if the sub-sequence is not
4012 * found.
4013 *
4014 * Because the sub-sequence must lie completely within the range @p
4015 * [__first1,__last1) it must start at a position less than @p
4016 * __last1-(__last2-__first2) where @p __last2-__first2 is the
4017 * length of the sub-sequence.
4018 *
4019 * This means that the returned iterator @c i will be in the range
4020 * @p [__first1,__last1-(__last2-__first2))
4021 */
4022 template<typename _ForwardIterator1, typename _ForwardIterator2>
4023 inline _ForwardIterator1
4024 search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
4025 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
4026 {
4027 // concept requirements
4028 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
4029 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
4030 __glibcxx_function_requires(_EqualOpConcept<
4031 typename iterator_traits<_ForwardIterator1>::value_type,
4032 typename iterator_traits<_ForwardIterator2>::value_type>)
4033 __glibcxx_requires_valid_range(__first1, __last1);
4034 __glibcxx_requires_valid_range(__first2, __last2);
4035
4036 return std::__search(__first1, __last1, __first2, __last2,
4037 __gnu_cxx::__ops::__iter_equal_to_iter());
4038 }
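 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::search; assumes <algorithm> and <string>.
#if 0
 void example_search()
 {
 std::string haystack = "abcabcabd";
 std::string needle = "abd";
 // Points at the "abd" starting at index 6, or haystack.end() if absent.
 auto it = std::search(haystack.begin(), haystack.end(),
 needle.begin(), needle.end());
 (void)it;
 }
#endif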
4039
4040 /**
4041 * @brief Search a sequence for a matching sub-sequence using a predicate.
4042 * @ingroup non_mutating_algorithms
4043 * @param __first1 A forward iterator.
4044 * @param __last1 A forward iterator.
4045 * @param __first2 A forward iterator.
4046 * @param __last2 A forward iterator.
4047 * @param __predicate A binary predicate.
4048 * @return The first iterator @c i in the range
4049 * @p [__first1,__last1-(__last2-__first2)) such that
4050 * @p __predicate(*(i+N),*(__first2+N)) is true for each @c N in the range
4051 * @p [0,__last2-__first2), or @p __last1 if no such iterator exists.
4052 *
4053 * Searches the range @p [__first1,__last1) for a sub-sequence that
4054 * compares equal value-by-value with the sequence given by @p
4055 * [__first2,__last2), using @p __predicate to determine equality,
4056 * and returns an iterator to the first element of the
4057 * sub-sequence, or @p __last1 if no such iterator exists.
4058 *
4059 * @see search(_ForwardIter1, _ForwardIter1, _ForwardIter2, _ForwardIter2)
4060 */
4061 template<typename _ForwardIterator1, typename _ForwardIterator2,
4062 typename _BinaryPredicate>
4063 inline _ForwardIterator1
4064 search(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
4065 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
4066 _BinaryPredicate __predicate)
4067 {
4068 // concept requirements
4069 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator1>)
4070 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator2>)
4071 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
4072 typename iterator_traits<_ForwardIterator1>::value_type,
4073 typename iterator_traits<_ForwardIterator2>::value_type>)
4074 __glibcxx_requires_valid_range(__first1, __last1);
4075 __glibcxx_requires_valid_range(__first2, __last2);
4076
4077 return std::__search(__first1, __last1, __first2, __last2,
4078 __gnu_cxx::__ops::__iter_comp_iter(__predicate));
4079 }
4080
4081 /**
4082 * @brief Search a sequence for a number of consecutive values.
4083 * @ingroup non_mutating_algorithms
4084 * @param __first A forward iterator.
4085 * @param __last A forward iterator.
4086 * @param __count The number of consecutive values.
4087 * @param __val The value to find.
4088 * @return The first iterator @c i in the range @p
4089 * [__first,__last-__count) such that @c *(i+N) == @p __val for
4090 * each @c N in the range @p [0,__count), or @p __last if no such
4091 * iterator exists.
4092 *
4093 * Searches the range @p [__first,__last) for @p __count consecutive elements
4094 * equal to @p __val.
4095 */
4096 template<typename _ForwardIterator, typename _Integer, typename _Tp>
4097 inline _ForwardIterator
4098 search_n(_ForwardIterator __first, _ForwardIterator __last,
4099 _Integer __count, const _Tp& __val)
4100 {
4101 // concept requirements
4102 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
4103 __glibcxx_function_requires(_EqualOpConcept<
4104 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
4105 __glibcxx_requires_valid_range(__first, __last);
4106
4107 return std::__search_n(__first, __last, __count,
4108 __gnu_cxx::__ops::__iter_equals_val(__val));
4109 }
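 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::search_n; assumes <algorithm> and <vector>.
#if 0
 void example_search_n()
 {
 std::vector<int> v{1, 0, 0, 0, 2};
 // Points at the start of the run of three consecutive 0s, or v.end().
 auto it = std::search_n(v.begin(), v.end(), 3, 0);
 (void)it;
 }
#endif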
4110
4111
4112 /**
4113 * @brief Search a sequence for a number of consecutive values using a
4114 * predicate.
4115 * @ingroup non_mutating_algorithms
4116 * @param __first A forward iterator.
4117 * @param __last A forward iterator.
4118 * @param __count The number of consecutive values.
4119 * @param __val The value to find.
4120 * @param __binary_pred A binary predicate.
4121 * @return The first iterator @c i in the range @p
4122 * [__first,__last-__count) such that @p
4123 * __binary_pred(*(i+N),__val) is true for each @c N in the range
4124 * @p [0,__count), or @p __last if no such iterator exists.
4125 *
4126 * Searches the range @p [__first,__last) for @p __count
4127 * consecutive elements for which the predicate returns true.
4128 */
4129 template<typename _ForwardIterator, typename _Integer, typename _Tp,
4130 typename _BinaryPredicate>
4131 inline _ForwardIterator
4132 search_n(_ForwardIterator __first, _ForwardIterator __last,
4133 _Integer __count, const _Tp& __val,
4134 _BinaryPredicate __binary_pred)
4135 {
4136 // concept requirements
4137 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
4138 __glibcxx_function_requires(_BinaryPredicateConcept<_BinaryPredicate,
4139 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
4140 __glibcxx_requires_valid_range(__first, __last);
4141
4142 return std::__search_n(__first, __last, __count,
4143 __gnu_cxx::__ops::__iter_comp_val(__binary_pred, __val));
4144 }
4145
4146
4147 /**
4148 * @brief Perform an operation on a sequence.
4149 * @ingroup mutating_algorithms
4150 * @param __first An input iterator.
4151 * @param __last An input iterator.
4152 * @param __result An output iterator.
4153 * @param __unary_op A unary operator.
4154 * @return An output iterator equal to @p __result+(__last-__first).
4155 *
4156 * Applies the operator to each element in the input range and assigns
4157 * the results to successive elements of the output sequence.
4158 * Evaluates @p *(__result+N)=unary_op(*(__first+N)) for each @c N in the
4159 * range @p [0,__last-__first).
4160 *
4161 * @p unary_op must not alter its argument.
4162 */
4163 template<typename _InputIterator, typename _OutputIterator,
4164 typename _UnaryOperation>
4165 _OutputIterator
4166 transform(_InputIterator __first, _InputIterator __last,
4167 _OutputIterator __result, _UnaryOperation __unary_op)
4168 {
4169 // concept requirements
4170 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
4171 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4172 // "the type returned by a _UnaryOperation"
4173 __typeof__(__unary_op(*__first))>)
4174 __glibcxx_requires_valid_range(__first, __last);
4175
4176 for (; __first != __last; ++__first, (void)++__result)
4177 *__result = __unary_op(*__first);
4178 return __result;
4179 }
4180
4181 /**
4182 * @brief Perform an operation on corresponding elements of two sequences.
4183 * @ingroup mutating_algorithms
4184 * @param __first1 An input iterator.
4185 * @param __last1 An input iterator.
4186 * @param __first2 An input iterator.
4187 * @param __result An output iterator.
4188 * @param __binary_op A binary operator.
4189 * @return An output iterator equal to @p result+(last-first).
4190 *
4191 * Applies the operator to the corresponding elements in the two
4192 * input ranges and assigns the results to successive elements of the
4193 * output sequence.
4194 * Evaluates @p
4195 * *(__result+N)=__binary_op(*(__first1+N),*(__first2+N)) for each
4196 * @c N in the range @p [0,__last1-__first1).
4197 *
4198 * @p binary_op must not alter either of its arguments.
4199 */
4200 template<typename _InputIterator1, typename _InputIterator2,
4201 typename _OutputIterator, typename _BinaryOperation>
4202 _OutputIterator
4203 transform(_InputIterator1 __first1, _InputIterator1 __last1,
4204 _InputIterator2 __first2, _OutputIterator __result,
4205 _BinaryOperation __binary_op)
4206 {
4207 // concept requirements
4208 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
4209 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
4210 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4211 // "the type returned by a _BinaryOperation"
4212 __typeof__(__binary_op(*__first1,*__first2))>)
4213 __glibcxx_requires_valid_range(__first1, __last1);
4214
4215 for (; __first1 != __last1; ++__first1, (void)++__first2, ++__result)
4216 *__result = __binary_op(*__first1, *__first2);
4217 return __result;
4218 }
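 // [Editorial usage sketch -- not part of this header.] Hedged examples of the
 // unary and binary std::transform overloads; assumes <algorithm>,
 // <functional> and <vector>.
#if 0
 void example_transform()
 {
 std::vector<int> a{1, 2, 3}, b{10, 20, 30}, out(3);
 std::transform(a.begin(), a.end(), out.begin(),
 [](int x) { return x * x; }); // out == {1, 4, 9}
 std::transform(a.begin(), a.end(), b.begin(), out.begin(),
 std::plus<int>()); // out == {11, 22, 33}
 }
#endif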
4219
4220 /**
4221 * @brief Replace each occurrence of one value in a sequence with another
4222 * value.
4223 * @ingroup mutating_algorithms
4224 * @param __first A forward iterator.
4225 * @param __last A forward iterator.
4226 * @param __old_value The value to be replaced.
4227 * @param __new_value The replacement value.
4228 * @return replace() returns no value.
4229 *
4230 * For each iterator @c i in the range @p [__first,__last) if @c *i ==
4231 * @p __old_value then the assignment @c *i = @p __new_value is performed.
4232 */
4233 template<typename _ForwardIterator, typename _Tp>
4234 void
4235 replace(_ForwardIterator __first, _ForwardIterator __last,
4236 const _Tp& __old_value, const _Tp& __new_value)
4237 {
4238 // concept requirements
4239 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
4240 _ForwardIterator>)
4241 __glibcxx_function_requires(_EqualOpConcept<
4242 typename iterator_traits<_ForwardIterator>::value_type, _Tp>)
4243 __glibcxx_function_requires(_ConvertibleConcept<_Tp,
4244 typename iterator_traits<_ForwardIterator>::value_type>)
4245 __glibcxx_requires_valid_range(__first, __last);
4246
4247 for (; __first != __last; ++__first)
4248 if (*__first == __old_value)
4249 *__first = __new_value;
4250 }
4251
4252 /**
4253 * @brief Replace each value in a sequence for which a predicate returns
4254 * true with another value.
4255 * @ingroup mutating_algorithms
4256 * @param __first A forward iterator.
4257 * @param __last A forward iterator.
4258 * @param __pred A predicate.
4259 * @param __new_value The replacement value.
4260 * @return replace_if() returns no value.
4261 *
4262 * For each iterator @c i in the range @p [__first,__last) if @p __pred(*i)
4263 * is true then the assignment @c *i = @p __new_value is performed.
4264 */
4265 template<typename _ForwardIterator, typename _Predicate, typename _Tp>
4266 void
4267 replace_if(_ForwardIterator __first, _ForwardIterator __last,
4268 _Predicate __pred, const _Tp& __new_value)
4269 {
4270 // concept requirements
4271 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
4272 _ForwardIterator>)
4273 __glibcxx_function_requires(_ConvertibleConcept<_Tp,
4274 typename iterator_traits<_ForwardIterator>::value_type>)
4275 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
4276 typename iterator_traits<_ForwardIterator>::value_type>)
4277 __glibcxx_requires_valid_range(__first, __last);
4278
4279 for (; __first != __last; ++__first)
4280 if (__pred(*__first))
4281 *__first = __new_value;
4282 }
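 // [Editorial usage sketch -- not part of this header.] Hedged examples of
 // std::replace and std::replace_if; assumes <algorithm> and <vector>.
#if 0
 void example_replace()
 {
 std::vector<int> v{1, -2, 3, -4};
 std::replace(v.begin(), v.end(), 3, 30); // {1, -2, 30, -4}
 std::replace_if(v.begin(), v.end(),
 [](int x) { return x < 0; }, 0); // {1, 0, 30, 0}
 }
#endif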
4283
4284 /**
4285 * @brief Assign the result of a function object to each value in a
4286 * sequence.
4287 * @ingroup mutating_algorithms
4288 * @param __first A forward iterator.
4289 * @param __last A forward iterator.
4290 * @param __gen A function object taking no arguments and returning
4291 * std::iterator_traits<_ForwardIterator>::value_type
4292 * @return generate() returns no value.
4293 *
4294 * Performs the assignment @c *i = @p __gen() for each @c i in the range
4295 * @p [__first,__last).
4296 */
4297 template<typename _ForwardIterator, typename _Generator>
4298 void
4299 generate(_ForwardIterator __first, _ForwardIterator __last,
4300 _Generator __gen)
4301 {
4302 // concept requirements
4303 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
4304 __glibcxx_function_requires(_GeneratorConcept<_Generator,
4305 typename iterator_traits<_ForwardIterator>::value_type>)
4306 __glibcxx_requires_valid_range(__first, __last);
4307
4308 for (; __first != __last; ++__first)
4309 *__first = __gen();
4310 }
4311
4312 /**
4313 * @brief Assign the result of a function object to each value in a
4314 * sequence.
4315 * @ingroup mutating_algorithms
4316 * @param __first A forward iterator.
4317 * @param __n The length of the sequence.
4318 * @param __gen A function object taking no arguments and returning
4319 * std::iterator_traits<_ForwardIterator>::value_type
4320 * @return The end of the sequence, @p __first+__n
4321 *
4322 * Performs the assignment @c *i = @p __gen() for each @c i in the range
4323 * @p [__first,__first+__n).
4324 *
4325 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4326 * DR 865. More algorithms that throw away information
4327 */
4328 template<typename _OutputIterator, typename _Size, typename _Generator>
4329 _OutputIterator
4330 generate_n(_OutputIterator __first, _Size __n, _Generator __gen)
4331 {
4332 // concept requirements
4333 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4334 // "the type returned by a _Generator"
4335 __typeof__(__gen())>)
4336
4337 for (__decltype(__n + 0) __niter = __n;
4338 __niter > 0; --__niter, ++__first)
4339 *__first = __gen();
4340 return __first;
4341 }
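 // [Editorial usage sketch -- not part of this header.] Hedged examples of
 // std::generate and std::generate_n; assumes <algorithm>, <iterator> and
 // <vector>.
#if 0
 void example_generate()
 {
 std::vector<int> v(5);
 int n = 0;
 std::generate(v.begin(), v.end(), [&n] { return n++; }); // {0,1,2,3,4}
 std::vector<int> w;
 std::generate_n(std::back_inserter(w), 3, [] { return 7; }); // {7,7,7}
 }
#endif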
4342
4343 /**
4344 * @brief Copy a sequence, removing consecutive duplicate values.
4345 * @ingroup mutating_algorithms
4346 * @param __first An input iterator.
4347 * @param __last An input iterator.
4348 * @param __result An output iterator.
4349 * @return An iterator designating the end of the resulting sequence.
4350 *
4351 * Copies each element in the range @p [__first,__last) to the range
4352 * beginning at @p __result, except that only the first element is copied
4353 * from groups of consecutive elements that compare equal.
4354 * unique_copy() is stable, so the relative order of elements that are
4355 * copied is unchanged.
4356 *
4357 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4358 * DR 241. Does unique_copy() require CopyConstructible and Assignable?
4359 *
4360 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4361 * DR 538. 241 again: Does unique_copy() require CopyConstructible and
4362 * Assignable?
4363 */
4364 template<typename _InputIterator, typename _OutputIterator>
4365 inline _OutputIterator
4366 unique_copy(_InputIterator __first, _InputIterator __last,
4367 _OutputIterator __result)
4368 {
4369 // concept requirements
4370 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
4371 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4372 typename iterator_traits<_InputIterator>::value_type>)
4373 __glibcxx_function_requires(_EqualityComparableConcept<
4374 typename iterator_traits<_InputIterator>::value_type>)
4375 __glibcxx_requires_valid_range(__first, __last);
4376
4377 if (__first == __last)
4378 return __result;
4379 return std::__unique_copy(__first, __last, __result,
4380 __gnu_cxx::__ops::__iter_equal_to_iter(),
4381 std::__iterator_category(__first),
4382 std::__iterator_category(__result));
4383 }
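 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::unique_copy; assumes <algorithm>, <iterator> and <vector>.
#if 0
 void example_unique_copy()
 {
 std::vector<int> v{1, 1, 2, 2, 2, 3, 1};
 std::vector<int> out;
 // Only the first element of each run of equal values is copied.
 std::unique_copy(v.begin(), v.end(), std::back_inserter(out)); // {1, 2, 3, 1}
 }
#endif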
4384
4385 /**
4386 * @brief Copy a sequence, removing consecutive values using a predicate.
4387 * @ingroup mutating_algorithms
4388 * @param __first An input iterator.
4389 * @param __last An input iterator.
4390 * @param __result An output iterator.
4391 * @param __binary_pred A binary predicate.
4392 * @return An iterator designating the end of the resulting sequence.
4393 *
4394 * Copies each element in the range @p [__first,__last) to the range
4395 * beginning at @p __result, except that only the first element is copied
4396 * from groups of consecutive elements for which @p __binary_pred returns
4397 * true.
4398 * unique_copy() is stable, so the relative order of elements that are
4399 * copied is unchanged.
4400 *
4401 * _GLIBCXX_RESOLVE_LIB_DEFECTS
4402 * DR 241. Does unique_copy() require CopyConstructible and Assignable?
4403 */
4404 template<typename _InputIterator, typename _OutputIterator,
4405 typename _BinaryPredicate>
4406 inline _OutputIterator
4407 unique_copy(_InputIterator __first, _InputIterator __last,
4408 _OutputIterator __result,
4409 _BinaryPredicate __binary_pred)
4410 {
4411 // concept requirements -- predicates checked later
4412 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
4413 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4414 typename iterator_traits<_InputIterator>::value_type>)
4415 __glibcxx_requires_valid_range(__first, __last);
4416
4417 if (__first == __last)
4418 return __result;
4419 return std::__unique_copy(__first, __last, __result,
4420 __gnu_cxx::__ops::__iter_comp_iter(__binary_pred),
4421 std::__iterator_category(__first),
4422 std::__iterator_category(__result));
4423 }
4424
4425#if _GLIBCXX_HOSTED
4426 /**
4427 * @brief Randomly shuffle the elements of a sequence.
4428 * @ingroup mutating_algorithms
4429 * @param __first A random access iterator.
4430 * @param __last A random access iterator.
4431 * @return Nothing.
4432 *
4433 * Reorder the elements in the range @p [__first,__last) using a random
4434 * distribution, so that every possible ordering of the sequence is
4435 * equally likely.
4436 */
4437 template<typename _RandomAccessIterator>
4438 inline void
4439 random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last)
4440 {
4441 // concept requirements
4442 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4443 _RandomAccessIterator>)
4444 __glibcxx_requires_valid_range(__first, __last);
4445
4446 if (__first != __last)
4447 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
4448 {
4449 // XXX rand() % N is not uniformly distributed
4450 _RandomAccessIterator __j = __first
4451 + std::rand() % ((__i - __first) + 1);
4452 if (__i != __j)
4453 std::iter_swap(__i, __j);
4454 }
4455 }
4456#endif
4457
4458 /**
4459 * @brief Shuffle the elements of a sequence using a random number
4460 * generator.
4461 * @ingroup mutating_algorithms
4462 * @param __first A random access iterator.
4463 * @param __last A random access iterator.
4464 * @param __rand The RNG functor or function.
4465 * @return Nothing.
4466 *
4467 * Reorders the elements in the range @p [__first,__last) using @p __rand to
4468 * provide a random distribution. Calling @p __rand(N) for a positive
4469 * integer @p N should return a randomly chosen integer from the
4470 * range [0,N).
4471 */
4472 template<typename _RandomAccessIterator, typename _RandomNumberGenerator>
4473 void
4474 random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last,
4475#if __cplusplus >= 201103L
4476 _RandomNumberGenerator&& __rand)
4477#else
4478 _RandomNumberGenerator& __rand)
4479#endif
4480 {
4481 // concept requirements
4482 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4483 _RandomAccessIterator>)
4484 __glibcxx_requires_valid_range(__first, __last);
4485
4486 if (__first == __last)
4487 return;
4488 for (_RandomAccessIterator __i = __first + 1; __i != __last; ++__i)
4489 {
4490 _RandomAccessIterator __j = __first + __rand((__i - __first) + 1);
4491 if (__i != __j)
4492 std::iter_swap(__i, __j);
4493 }
4494 }
4495
4496
4497 /**
4498 * @brief Move elements for which a predicate is true to the beginning
4499 * of a sequence.
4500 * @ingroup mutating_algorithms
4501 * @param __first A forward iterator.
4502 * @param __last A forward iterator.
4503 * @param __pred A predicate functor.
4504 * @return An iterator @p middle such that @p __pred(*i) is true for each
4505 * iterator @p i in the range @p [__first,middle) and false for each @p i
4506 * in the range @p [middle,__last).
4507 *
4508 * @p __pred must not modify its operand. @p partition() does not preserve
4509 * the relative ordering of elements in each group, use
4510 * @p stable_partition() if this is needed.
4511 */
4512 template<typename _ForwardIterator, typename _Predicate>
4513 inline _ForwardIterator
4514 partition(_ForwardIterator __first, _ForwardIterator __last,
4515 _Predicate __pred)
4516 {
4517 // concept requirements
4518 __glibcxx_function_requires(_Mutable_ForwardIteratorConcept<
4519 _ForwardIterator>)
4520 __glibcxx_function_requires(_UnaryPredicateConcept<_Predicate,
4521 typename iterator_traits<_ForwardIterator>::value_type>)
4522 __glibcxx_requires_valid_range(__first, __last);
4523
4524 return std::__partition(__first, __last, __pred,
4525 std::__iterator_category(__first));
4526 }
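 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::partition; assumes <algorithm> and <vector>.
#if 0
 void example_partition()
 {
 std::vector<int> v{1, 2, 3, 4, 5, 6};
 auto mid = std::partition(v.begin(), v.end(),
 [](int x) { return x % 2 == 0; });
 // [v.begin(), mid) now holds the even values and [mid, v.end()) the odd
 // ones; the order inside each group is unspecified (use stable_partition
 // to preserve it).
 (void)mid;
 }
#endif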
4527
4528
4529 /**
4530 * @brief Sort the smallest elements of a sequence.
4531 * @ingroup sorting_algorithms
4532 * @param __first An iterator.
4533 * @param __middle Another iterator.
4534 * @param __last Another iterator.
4535 * @return Nothing.
4536 *
4537 * Sorts the smallest @p (__middle-__first) elements in the range
4538 * @p [__first,__last) and moves them to the range @p [__first,__middle). The
4539 * order of the remaining elements in the range @p [__middle,__last) is
4540 * undefined.
4541 * After the sort if @e i and @e j are iterators in the range
4542 * @p [__first,__middle) such that i precedes j and @e k is an iterator in
4543 * the range @p [__middle,__last) then *j<*i and *k<*i are both false.
4544 */
4545 template<typename _RandomAccessIterator>
4546 inline void
4547 partial_sort(_RandomAccessIterator __first,
4548 _RandomAccessIterator __middle,
4549 _RandomAccessIterator __last)
4550 {
4551 // concept requirements
4552 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4553 _RandomAccessIterator>)
4554 __glibcxx_function_requires(_LessThanComparableConcept<
4555 typename iterator_traits<_RandomAccessIterator>::value_type>)
4556 __glibcxx_requires_valid_range(__first, __middle);
4557 __glibcxx_requires_valid_range(__middle, __last);
4558 __glibcxx_requires_irreflexive(__first, __last);
4559
4560 std::__partial_sort(__first, __middle, __last,
4561 __gnu_cxx::__ops::__iter_less_iter());
4562 }
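 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::partial_sort; assumes <algorithm> and <vector>.
#if 0
 void example_partial_sort()
 {
 std::vector<int> v{5, 7, 4, 2, 8, 6, 1, 9, 0, 3};
 // The three smallest values end up sorted at the front: v[0..2] == {0,1,2};
 // the order of the remaining elements is unspecified.
 std::partial_sort(v.begin(), v.begin() + 3, v.end());
 }
#endif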
4563
4564 /**
4565 * @brief Sort the smallest elements of a sequence using a predicate
4566 * for comparison.
4567 * @ingroup sorting_algorithms
4568 * @param __first An iterator.
4569 * @param __middle Another iterator.
4570 * @param __last Another iterator.
4571 * @param __comp A comparison functor.
4572 * @return Nothing.
4573 *
4574 * Sorts the smallest @p (__middle-__first) elements in the range
4575 * @p [__first,__last) and moves them to the range @p [__first,__middle). The
4576 * order of the remaining elements in the range @p [__middle,__last) is
4577 * undefined.
4578 * After the sort if @e i and @e j are iterators in the range
4579 * @p [__first,__middle) such that i precedes j and @e k is an iterator in
4580 * the range @p [__middle,__last) then @p __comp(*j,*i) and @p __comp(*k,*i)
4581 * are both false.
4582 */
4583 template<typename _RandomAccessIterator, typename _Compare>
4584 inline void
4585 partial_sort(_RandomAccessIterator __first,
4586 _RandomAccessIterator __middle,
4587 _RandomAccessIterator __last,
4588 _Compare __comp)
4589 {
4590 // concept requirements
4591 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4592 _RandomAccessIterator>)
4593 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4594 typename iterator_traits<_RandomAccessIterator>::value_type,
4595 typename iterator_traits<_RandomAccessIterator>::value_type>)
4596 __glibcxx_requires_valid_range(__first, __middle);
4597 __glibcxx_requires_valid_range(__middle, __last);
4598 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4599
4600 std::__partial_sort(__first, __middle, __last,
4601 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4602 }
4603
4604 /**
4605 * @brief Sort a sequence just enough to find a particular position.
4606 * @ingroup sorting_algorithms
4607 * @param __first An iterator.
4608 * @param __nth Another iterator.
4609 * @param __last Another iterator.
4610 * @return Nothing.
4611 *
4612 * Rearranges the elements in the range @p [__first,__last) so that @p *__nth
4613 * is the same element that would have been in that position had the
4614 * whole sequence been sorted. The elements either side of @p *__nth are
4615 * not completely sorted, but for any iterator @e i in the range
4616 * @p [__first,__nth) and any iterator @e j in the range @p [__nth,__last) it
4617 * holds that *j < *i is false.
4618 */
4619 template<typename _RandomAccessIterator>
4620 inline void
4621 nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth,
4622 _RandomAccessIterator __last)
4623 {
4624 // concept requirements
4625 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4626 _RandomAccessIterator>)
4627 __glibcxx_function_requires(_LessThanComparableConcept<
4628 typename iterator_traits<_RandomAccessIterator>::value_type>)
4629 __glibcxx_requires_valid_range(__first, __nth);
4630 __glibcxx_requires_valid_range(__nth, __last);
4631 __glibcxx_requires_irreflexive(__first, __last);
4632
4633 if (__first == __last || __nth == __last)
4634 return;
4635
4636 std::__introselect(__first, __nth, __last,
4637 std::__lg(__last - __first) * 2,
4638 __gnu_cxx::__ops::__iter_less_iter());
4639 }
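 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::nth_element; assumes <algorithm> and <vector>.
#if 0
 void example_nth_element()
 {
 std::vector<int> v{5, 7, 4, 2, 8, 6, 1, 9, 0, 3};
 auto nth = v.begin() + v.size() / 2;
 std::nth_element(v.begin(), nth, v.end());
 // *nth is now the value that full sorting would place there (here 5);
 // everything before it is <= *nth and everything after it is >= *nth.
 }
#endif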
4640
4641 /**
4642 * @brief Sort a sequence just enough to find a particular position
4643 * using a predicate for comparison.
4644 * @ingroup sorting_algorithms
4645 * @param __first An iterator.
4646 * @param __nth Another iterator.
4647 * @param __last Another iterator.
4648 * @param __comp A comparison functor.
4649 * @return Nothing.
4650 *
4651 * Rearranges the elements in the range @p [__first,__last) so that @p *__nth
4652 * is the same element that would have been in that position had the
4653 * whole sequence been sorted. The elements either side of @p *__nth are
4654 * not completely sorted, but for any iterator @e i in the range
4655 * @p [__first,__nth) and any iterator @e j in the range @p [__nth,__last) it
4656 * holds that @p __comp(*j,*i) is false.
4657 */
4658 template<typename _RandomAccessIterator, typename _Compare>
4659 inline void
4660 nth_element(_RandomAccessIterator __first, _RandomAccessIterator __nth,
4661 _RandomAccessIterator __last, _Compare __comp)
4662 {
4663 // concept requirements
4664 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4665 _RandomAccessIterator>)
4666 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4667 typename iterator_traits<_RandomAccessIterator>::value_type,
4668 typename iterator_traits<_RandomAccessIterator>::value_type>)
4669 __glibcxx_requires_valid_range(__first, __nth);
4670 __glibcxx_requires_valid_range(__nth, __last);
4671 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4672
4673 if (__first == __last || __nth == __last)
4674 return;
4675
4676 std::__introselect(__first, __nth, __last,
4677 std::__lg(__last - __first) * 2,
4678 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4679 }
4680
4681 /**
4682 * @brief Sort the elements of a sequence.
4683 * @ingroup sorting_algorithms
4684 * @param __first An iterator.
4685 * @param __last Another iterator.
4686 * @return Nothing.
4687 *
4688 * Sorts the elements in the range @p [__first,__last) in ascending order,
4689 * such that for each iterator @e i in the range @p [__first,__last-1),
4690 * *(i+1)<*i is false.
4691 *
4692 * The relative ordering of equivalent elements is not preserved, use
4693 * @p stable_sort() if this is needed.
4694 */
4695 template<typename _RandomAccessIterator>
4696 inline void
4697 sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
4698 {
4699 // concept requirements
4700 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4701 _RandomAccessIterator>)
4702 __glibcxx_function_requires(_LessThanComparableConcept<
4703 typename iterator_traits<_RandomAccessIterator>::value_type>)
4704 __glibcxx_requires_valid_range(__first, __last);
4705 __glibcxx_requires_irreflexive(__first, __last);
4706
4707 std::__sort(__first, __last, __gnu_cxx::__ops::__iter_less_iter());
4708 }
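 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::sort with a comparator; assumes <algorithm>, <utility> and <vector>.
#if 0
 void example_sort()
 {
 std::vector<std::pair<int, char> > v{{2, 'a'}, {1, 'b'}, {2, 'c'}};
 std::sort(v.begin(), v.end(),
 [](const std::pair<int, char>& x, const std::pair<int, char>& y)
 { return x.first < y.first; });
 // Keys are now 1, 2, 2; std::stable_sort with the same comparator would
 // additionally guarantee that {2,'a'} stays before {2,'c'}.
 }
#endif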
4709
4710 /**
4711 * @brief Sort the elements of a sequence using a predicate for comparison.
4712 * @ingroup sorting_algorithms
4713 * @param __first An iterator.
4714 * @param __last Another iterator.
4715 * @param __comp A comparison functor.
4716 * @return Nothing.
4717 *
4718 * Sorts the elements in the range @p [__first,__last) in ascending order,
4719 * such that @p __comp(*(i+1),*i) is false for every iterator @e i in the
4720 * range @p [__first,__last-1).
4721 *
4722 * The relative ordering of equivalent elements is not preserved, use
4723 * @p stable_sort() if this is needed.
4724 */
4725 template<typename _RandomAccessIterator, typename _Compare>
4726 inline void
4727 sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
4728 _Compare __comp)
4729 {
4730 // concept requirements
4731 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4732 _RandomAccessIterator>)
4733 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4734 typename iterator_traits<_RandomAccessIterator>::value_type,
4735 typename iterator_traits<_RandomAccessIterator>::value_type>)
4736 __glibcxx_requires_valid_range(__first, __last);
4737 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4738
4739 std::__sort(__first, __last, __gnu_cxx::__ops::__iter_comp_iter(__comp));
4740 }
4741
4742 template<typename _InputIterator1, typename _InputIterator2,
4743 typename _OutputIterator, typename _Compare>
4744 _OutputIterator
4745 __merge(_InputIterator1 __first1, _InputIterator1 __last1,
4746 _InputIterator2 __first2, _InputIterator2 __last2,
4747 _OutputIterator __result, _Compare __comp)
4748 {
4749 while (__first1 != __last1 && __first2 != __last2)
4750 {
4751 if (__comp(__first2, __first1))
4752 {
4753 *__result = *__first2;
4754 ++__first2;
4755 }
4756 else
4757 {
4758 *__result = *__first1;
4759 ++__first1;
4760 }
4761 ++__result;
4762 }
4763 return std::copy(__first2, __last2,
4764 std::copy(__first1, __last1, __result));
4765 }
4766
4767 /**
4768 * @brief Merges two sorted ranges.
4769 * @ingroup sorting_algorithms
4770 * @param __first1 An iterator.
4771 * @param __last1 Another iterator.
4772 * @param __first2 Another iterator.
4773 * @param __last2 Another iterator.
4774 * @param __result An output iterator to the start of the merged range.
4775 * @return An output iterator to the end of the merged range, that is,
4776 * @p __result + (__last1-__first1) + (__last2-__first2).
4777 *
4778 * Merges the ranges @p [__first1,__last1) and @p [__first2,__last2) into
4779 * the sorted range @p [__result, __result + (__last1-__first1) +
4780 * (__last2-__first2)). Both input ranges must be sorted, and the
4781 * output range must not overlap with either of the input ranges.
4782 * The sort is @e stable, that is, for equivalent elements in the
4783 * two ranges, elements from the first range will always come
4784 * before elements from the second.
4785 */
4786 template<typename _InputIterator1, typename _InputIterator2,
4787 typename _OutputIterator>
4788 inline _OutputIterator
4789 merge(_InputIterator1 __first1, _InputIterator1 __last1,
4790 _InputIterator2 __first2, _InputIterator2 __last2,
4791 _OutputIterator __result)
4792 {
4793 // concept requirements
4794 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
4795 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
4796 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4797 typename iterator_traits<_InputIterator1>::value_type>)
4798 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4799 typename iterator_traits<_InputIterator2>::value_type>)
4800 __glibcxx_function_requires(_LessThanOpConcept<
4801 typename iterator_traits<_InputIterator2>::value_type,
4802 typename iterator_traits<_InputIterator1>::value_type>)
4803 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
4804 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
4805 __glibcxx_requires_irreflexive2(__first1, __last1);
4806 __glibcxx_requires_irreflexive2(__first2, __last2);
4807
4808 return _GLIBCXX_STD_A::__merge(__first1, __last1,
4809 __first2, __last2, __result,
4810 __gnu_cxx::__ops::__iter_less_iter());
4811 }
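 // [Editorial usage sketch -- not part of this header.] A hedged example of
 // std::merge; assumes <algorithm>, <iterator> and <vector>.
#if 0
 void example_merge()
 {
 std::vector<int> a{1, 3, 5}, b{2, 3, 6}, out;
 std::merge(a.begin(), a.end(), b.begin(), b.end(),
 std::back_inserter(out));
 // out == {1, 2, 3, 3, 5, 6}; for equal values the element from `a`
 // precedes the one from `b` (the merge is stable).
 }
#endif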
4812
4813 /**
4814 * @brief Merges two sorted ranges.
4815 * @ingroup sorting_algorithms
4816 * @param __first1 An iterator.
4817 * @param __last1 Another iterator.
4818 * @param __first2 Another iterator.
4819 * @param __last2 Another iterator.
4820 * @param __result An output iterator to the start of the merged range.
4821 * @param __comp A functor to use for comparisons.
4822 * @return An output iterator to the end of the merged range, that is,
4823 * @p __result + (__last1-__first1) + (__last2-__first2).
4824 *
4825 * Merges the ranges @p [__first1,__last1) and @p [__first2,__last2) into
4826 * the sorted range @p [__result, __result + (__last1-__first1) +
4827 * (__last2-__first2)). Both input ranges must be sorted, and the
4828 * output range must not overlap with either of the input ranges.
4829 * The sort is @e stable, that is, for equivalent elements in the
4830 * two ranges, elements from the first range will always come
4831 * before elements from the second.
4832 *
4833 * The comparison function should have the same effects on ordering as
4834 * the function used for the initial sort.
4835 */
4836 template<typename _InputIterator1, typename _InputIterator2,
4837 typename _OutputIterator, typename _Compare>
4838 inline _OutputIterator
4839 merge(_InputIterator1 __first1, _InputIterator1 __last1,
4840 _InputIterator2 __first2, _InputIterator2 __last2,
4841 _OutputIterator __result, _Compare __comp)
4842 {
4843 // concept requirements
4844 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
4845 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
4846 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4847 typename iterator_traits<_InputIterator1>::value_type>)
4848 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
4849 typename iterator_traits<_InputIterator2>::value_type>)
4850 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4851 typename iterator_traits<_InputIterator2>::value_type,
4852 typename iterator_traits<_InputIterator1>::value_type>)
4853 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
4854 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
4855 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
4856 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
4857
4858 return _GLIBCXX_STD_A::__merge(__first1, __last1,
4859 __first2, __last2, __result,
4860 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4861 }
4862
4863 template<typename _RandomAccessIterator, typename _Compare>
4864 inline void
4865 __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
4866 _Compare __comp)
4867 {
4868 typedef typename iterator_traits<_RandomAccessIterator>::value_type
4869 _ValueType;
4870 typedef typename iterator_traits<_RandomAccessIterator>::difference_type
4871 _DistanceType;
4872
4873 typedef _Temporary_buffer<_RandomAccessIterator, _ValueType> _TmpBuf;
4874 _TmpBuf __buf(__first, __last);
4875
4876 if (__buf.begin() == 0)
4877 std::__inplace_stable_sort(__first, __last, __comp);
4878 else
4879 std::__stable_sort_adaptive(__first, __last, __buf.begin(),
4880 _DistanceType(__buf.size()), __comp);
4881 }
4882
4883 /**
4884 * @brief Sort the elements of a sequence, preserving the relative order
4885 * of equivalent elements.
4886 * @ingroup sorting_algorithms
4887 * @param __first An iterator.
4888 * @param __last Another iterator.
4889 * @return Nothing.
4890 *
4891 * Sorts the elements in the range @p [__first,__last) in ascending order,
4892 * such that for each iterator @p i in the range @p [__first,__last-1),
4893 * @p *(i+1)<*i is false.
4894 *
4895 * The relative ordering of equivalent elements is preserved, so any two
4896 * elements @p x and @p y in the range @p [__first,__last) such that
4897 * @p x<y is false and @p y<x is false will have the same relative
4898 * ordering after calling @p stable_sort().
4899 */
4900 template<typename _RandomAccessIterator>
4901 inline void
4902 stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last)
4903 {
4904 // concept requirements
4905 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4906 _RandomAccessIterator>)
4907 __glibcxx_function_requires(_LessThanComparableConcept<
4908 typename iterator_traits<_RandomAccessIterator>::value_type>)
4909 __glibcxx_requires_valid_range(__first, __last);
4910 __glibcxx_requires_irreflexive(__first, __last);
4911
4912 _GLIBCXX_STD_A::__stable_sort(__first, __last,
4913 __gnu_cxx::__ops::__iter_less_iter());
4914 }
4915
4916 /**
4917 * @brief Sort the elements of a sequence using a predicate for comparison,
4918 * preserving the relative order of equivalent elements.
4919 * @ingroup sorting_algorithms
4920 * @param __first An iterator.
4921 * @param __last Another iterator.
4922 * @param __comp A comparison functor.
4923 * @return Nothing.
4924 *
4925 * Sorts the elements in the range @p [__first,__last) in ascending order,
4926 * such that for each iterator @p i in the range @p [__first,__last-1),
4927 * @p __comp(*(i+1),*i) is false.
4928 *
4929 * The relative ordering of equivalent elements is preserved, so any two
4930 * elements @p x and @p y in the range @p [__first,__last) such that
4931 * @p __comp(x,y) is false and @p __comp(y,x) is false will have the same
4932 * relative ordering after calling @p stable_sort().
4933 */
4934 template<typename _RandomAccessIterator, typename _Compare>
4935 inline void
4936 stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last,
4937 _Compare __comp)
4938 {
4939 // concept requirements
4940 __glibcxx_function_requires(_Mutable_RandomAccessIteratorConcept<
4941 _RandomAccessIterator>)
4942 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
4943 typename iterator_traits<_RandomAccessIterator>::value_type,
4944 typename iterator_traits<_RandomAccessIterator>::value_type>)
4945 __glibcxx_requires_valid_range(__first, __last);
4946 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
4947
4948 _GLIBCXX_STD_A::__stable_sort(__first, __last,
4949 __gnu_cxx::__ops::__iter_comp_iter(__comp));
4950 }
4951
4952 template<typename _InputIterator1, typename _InputIterator2,
4953 typename _OutputIterator,
4954 typename _Compare>
4955 _OutputIterator
4956 __set_union(_InputIterator1 __first1, _InputIterator1 __last1,
4957 _InputIterator2 __first2, _InputIterator2 __last2,
4958 _OutputIterator __result, _Compare __comp)
4959 {
4960 while (__first1 != __last1 && __first2 != __last2)
4961 {
4962 if (__comp(__first1, __first2))
4963 {
4964 *__result = *__first1;
4965 ++__first1;
4966 }
4967 else if (__comp(__first2, __first1))
4968 {
4969 *__result = *__first2;
4970 ++__first2;
4971 }
4972 else
4973 {
4974 *__result = *__first1;
4975 ++__first1;
4976 ++__first2;
4977 }
4978 ++__result;
4979 }
4980 return std::copy(__first2, __last2,
4981 std::copy(__first1, __last1, __result));
4982 }
4983
4984 /**
4985 * @brief Return the union of two sorted ranges.
4986 * @ingroup set_algorithms
4987 * @param __first1 Start of first range.
4988 * @param __last1 End of first range.
4989 * @param __first2 Start of second range.
4990 * @param __last2 End of second range.
4991 * @return End of the output range.
4992 * @ingroup set_algorithms
4993 *
4994 * This operation iterates over both ranges, copying elements present in
4995 * each range in order to the output range. Iterators increment for each
4996 * range. When the current element of one range is less than the other,
4997 * that element is copied and the iterator advanced. If an element is
4998 * contained in both ranges, the element from the first range is copied and
4999 * both ranges advance. The output range may not overlap either input
5000 * range.
5001 */
5002 template<typename _InputIterator1, typename _InputIterator2,
5003 typename _OutputIterator>
5004 inline _OutputIterator
5005 set_union(_InputIterator1 __first1, _InputIterator1 __last1,
5006 _InputIterator2 __first2, _InputIterator2 __last2,
5007 _OutputIterator __result)
5008 {
5009 // concept requirements
5010 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5011 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5012 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5013 typename iterator_traits<_InputIterator1>::value_type>)
5014 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5015 typename iterator_traits<_InputIterator2>::value_type>)
5016 __glibcxx_function_requires(_LessThanOpConcept<
5017 typename iterator_traits<_InputIterator1>::value_type,
5018 typename iterator_traits<_InputIterator2>::value_type>)
5019 __glibcxx_function_requires(_LessThanOpConcept<
5020 typename iterator_traits<_InputIterator2>::value_type,
5021 typename iterator_traits<_InputIterator1>::value_type>)
5022 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5023 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5024 __glibcxx_requires_irreflexive2(__first1, __last1);
5025 __glibcxx_requires_irreflexive2(__first2, __last2);
5026
5027 return _GLIBCXX_STD_A::__set_union(__first1, __last1,
5028 __first2, __last2, __result,
5029 __gnu_cxx::__ops::__iter_less_iter());
5030 }
5031
5032 /**
5033 * @brief Return the union of two sorted ranges using a comparison functor.
5034 * @ingroup set_algorithms
5035 * @param __first1 Start of first range.
5036 * @param __last1 End of first range.
5037 * @param __first2 Start of second range.
5038 * @param __last2 End of second range.
5039 * @param __comp The comparison functor.
5040 * @return End of the output range.
5041 * @ingroup set_algorithms
5042 *
5043 * This operation iterates over both ranges, copying elements present in
5044 * each range in order to the output range. Iterators increment for each
5045 * range. When the current element of one range is less than the other
5046 * according to @p __comp, that element is copied and the iterator advanced.
5047 * If an equivalent element according to @p __comp is contained in both
5048 * ranges, the element from the first range is copied and both ranges
5049 * advance. The output range may not overlap either input range.
5050 */
5051 template<typename _InputIterator1, typename _InputIterator2,
5052 typename _OutputIterator, typename _Compare>
5053 inline _OutputIterator
5054 set_union(_InputIterator1 __first1, _InputIterator1 __last1,
5055 _InputIterator2 __first2, _InputIterator2 __last2,
5056 _OutputIterator __result, _Compare __comp)
5057 {
5058 // concept requirements
5059 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5060 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5061 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5062 typename iterator_traits<_InputIterator1>::value_type>)
5063 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5064 typename iterator_traits<_InputIterator2>::value_type>)
5065 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5066 typename iterator_traits<_InputIterator1>::value_type,
5067 typename iterator_traits<_InputIterator2>::value_type>)
5068 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5069 typename iterator_traits<_InputIterator2>::value_type,
5070 typename iterator_traits<_InputIterator1>::value_type>)
5071 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5072 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5073 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5074 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5075
5076 return _GLIBCXX_STD_A::__set_union(__first1, __last1,
5077 __first2, __last2, __result,
5078 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5079 }
5080
5081 template<typename _InputIterator1, typename _InputIterator2,
5082 typename _OutputIterator,
5083 typename _Compare>
5084 _OutputIterator
5085 __set_intersection(_InputIterator1 __first1, _InputIterator1 __last1,
5086 _InputIterator2 __first2, _InputIterator2 __last2,
5087 _OutputIterator __result, _Compare __comp)
5088 {
5089 while (__first1 != __last1 && __first2 != __last2)
5090 if (__comp(__first1, __first2))
5091 ++__first1;
5092 else if (__comp(__first2, __first1))
5093 ++__first2;
5094 else
5095 {
5096 *__result = *__first1;
5097 ++__first1;
5098 ++__first2;
5099 ++__result;
5100 }
5101 return __result;
5102 }
5103
5104 /**
5105 * @brief Return the intersection of two sorted ranges.
5106 * @ingroup set_algorithms
5107 * @param __first1 Start of first range.
5108 * @param __last1 End of first range.
5109 * @param __first2 Start of second range.
5110 * @param __last2 End of second range.
5111 * @return End of the output range.
5112 * @ingroup set_algorithms
5113 *
5114 * This operation iterates over both ranges, copying elements present in
5115 * both ranges in order to the output range. Iterators increment for each
5116 * range. When the current element of one range is less than the other,
5117 * that iterator advances. If an element is contained in both ranges, the
5118 * element from the first range is copied and both ranges advance. The
5119 * output range may not overlap either input range.
5120 */
5121 template<typename _InputIterator1, typename _InputIterator2,
5122 typename _OutputIterator>
5123 inline _OutputIterator
5124 set_intersection(_InputIterator1 __first1, _InputIterator1 __last1,
5125 _InputIterator2 __first2, _InputIterator2 __last2,
5126 _OutputIterator __result)
5127 {
5128 // concept requirements
5129 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5130 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5131 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5132 typename iterator_traits<_InputIterator1>::value_type>)
5133 __glibcxx_function_requires(_LessThanOpConcept<
5134 typename iterator_traits<_InputIterator1>::value_type,
5135 typename iterator_traits<_InputIterator2>::value_type>)
5136 __glibcxx_function_requires(_LessThanOpConcept<
5137 typename iterator_traits<_InputIterator2>::value_type,
5138 typename iterator_traits<_InputIterator1>::value_type>)
5139 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5140 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5141 __glibcxx_requires_irreflexive2(__first1, __last1);
5142 __glibcxx_requires_irreflexive2(__first2, __last2);
5143
5144 return _GLIBCXX_STD_A::__set_intersection(__first1, __last1,
5145 __first2, __last2, __result,
5146 __gnu_cxx::__ops::__iter_less_iter());
5147 }
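 // [Editorial usage sketch -- not part of this header.] Hedged examples of
 // std::set_union and std::set_intersection on sorted inputs; assumes
 // <algorithm>, <iterator> and <vector>.
#if 0
 void example_set_ops()
 {
 std::vector<int> a{1, 2, 4, 5}, b{2, 3, 5}, u, i;
 std::set_union(a.begin(), a.end(), b.begin(), b.end(),
 std::back_inserter(u)); // u == {1, 2, 3, 4, 5}
 std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
 std::back_inserter(i)); // i == {2, 5}
 }
#endif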
5148
5149 /**
5150 * @brief Return the intersection of two sorted ranges using comparison
5151 * functor.
5152 * @ingroup set_algorithms
5153 * @param __first1 Start of first range.
5154 * @param __last1 End of first range.
5155 * @param __first2 Start of second range.
5156 * @param __last2 End of second range.
5157 * @param __comp The comparison functor.
5158 * @return End of the output range.
5159 * @ingroup set_algorithms
5160 *
5161 * This operation iterates over both ranges, copying elements present in
5162 * both ranges in order to the output range. Iterators increment for each
5163 * range. When the current element of one range is less than the other
5164 * according to @p __comp, that iterator advances. If an element is
5165 * contained in both ranges according to @p __comp, the element from the
5166 * first range is copied and both ranges advance. The output range may not
5167 * overlap either input range.
5168 */
5169 template<typename _InputIterator1, typename _InputIterator2,
5170 typename _OutputIterator, typename _Compare>
5171 inline _OutputIterator
5172 set_intersection(_InputIterator1 __first1, _InputIterator1 __last1,
5173 _InputIterator2 __first2, _InputIterator2 __last2,
5174 _OutputIterator __result, _Compare __comp)
5175 {
5176 // concept requirements
5177 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5178 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5179 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5180 typename iterator_traits<_InputIterator1>::value_type>)
5181 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5182 typename iterator_traits<_InputIterator1>::value_type,
5183 typename iterator_traits<_InputIterator2>::value_type>)
5184 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5185 typename iterator_traits<_InputIterator2>::value_type,
5186 typename iterator_traits<_InputIterator1>::value_type>)
5187 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5188 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5189 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5190 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5191
5192 return _GLIBCXX_STD_A::__set_intersection(__first1, __last1,
5193 __first2, __last2, __result,
5194 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5195 }
5196
5197 template<typename _InputIterator1, typename _InputIterator2,
5198 typename _OutputIterator,
5199 typename _Compare>
5200 _OutputIterator
5201 __set_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5202 _InputIterator2 __first2, _InputIterator2 __last2,
5203 _OutputIterator __result, _Compare __comp)
5204 {
5205 while (__first1 != __last1 && __first2 != __last2)
5206 if (__comp(__first1, __first2))
5207 {
5208 *__result = *__first1;
5209 ++__first1;
5210 ++__result;
5211 }
5212 else if (__comp(__first2, __first1))
5213 ++__first2;
5214 else
5215 {
5216 ++__first1;
5217 ++__first2;
5218 }
5219 return std::copy(__first1, __last1, __result);
5220 }
5221
5222 /**
5223 * @brief Return the difference of two sorted ranges.
5224 * @ingroup set_algorithms
5225 * @param __first1 Start of first range.
5226 * @param __last1 End of first range.
5227 * @param __first2 Start of second range.
5228 * @param __last2 End of second range.
5229 * @return End of the output range.
5230 * @ingroup set_algorithms
5231 *
5232 * This operation iterates over both ranges, copying elements present in
5233 * the first range but not the second in order to the output range.
5234 * Iterators increment for each range. When the current element of the
5235 * first range is less than the second, that element is copied and the
5236 * iterator advances. If the current element of the second range is less,
5237 * the iterator advances, but no element is copied. If an element is
5238 * contained in both ranges, no elements are copied and both ranges
5239 * advance. The output range may not overlap either input range.
5240 */
5241 template<typename _InputIterator1, typename _InputIterator2,
5242 typename _OutputIterator>
5243 inline _OutputIterator
5244 set_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5245 _InputIterator2 __first2, _InputIterator2 __last2,
5246 _OutputIterator __result)
5247 {
5248 // concept requirements
5249 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5250 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5251 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5252 typename iterator_traits<_InputIterator1>::value_type>)
5253 __glibcxx_function_requires(_LessThanOpConcept<
5254 typename iterator_traits<_InputIterator1>::value_type,
5255 typename iterator_traits<_InputIterator2>::value_type>)
5256 __glibcxx_function_requires(_LessThanOpConcept<
5257 typename iterator_traits<_InputIterator2>::value_type,
5258 typename iterator_traits<_InputIterator1>::value_type>)
5259 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5260 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5261 __glibcxx_requires_irreflexive2(__first1, __last1);
5262 __glibcxx_requires_irreflexive2(__first2, __last2);
5263
5264 return _GLIBCXX_STD_A::__set_difference(__first1, __last1,
5265 __first2, __last2, __result,
5266 __gnu_cxx::__ops::__iter_less_iter());
5267 }
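A similar sketch for set_difference, again assuming sorted std::vector<int> inputs (helper name illustrative):

#include <algorithm>
#include <iterator>
#include <vector>

void difference_demo() {
  std::vector<int> a{1, 2, 3, 5, 7};
  std::vector<int> b{2, 3, 4};
  std::vector<int> out;
  // Keeps the values of 'a' that are not in 'b'; out becomes {1, 5, 7}.
  std::set_difference(a.begin(), a.end(), b.begin(), b.end(),
                      std::back_inserter(out));
}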
5268
5269 /**
5270 * @brief Return the difference of two sorted ranges using comparison
5271 * functor.
5272 * @ingroup set_algorithms
5273 * @param __first1 Start of first range.
5274 * @param __last1 End of first range.
5275 * @param __first2 Start of second range.
5276 * @param __last2 End of second range.
5277 * @param __comp The comparison functor.
5278 * @return End of the output range.
5279 * @ingroup set_algorithms
5280 *
5281 * This operation iterates over both ranges, copying elements present in
5282 * the first range but not the second in order to the output range.
5283 * Iterators increment for each range. When the current element of the
5284 * first range is less than the second according to @p __comp, that element
5285 * is copied and the iterator advances. If the current element of the
5286 * second range is less, no element is copied and the iterator advances.
5287 * If an element is contained in both ranges according to @p __comp, no
5288 * elements are copied and both ranges advance. The output range may not
5289 * overlap either input range.
5290 */
5291 template<typename _InputIterator1, typename _InputIterator2,
5292 typename _OutputIterator, typename _Compare>
5293 inline _OutputIterator
5294 set_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5295 _InputIterator2 __first2, _InputIterator2 __last2,
5296 _OutputIterator __result, _Compare __comp)
5297 {
5298 // concept requirements
5299 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5300 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5301 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5302 typename iterator_traits<_InputIterator1>::value_type>)
5303 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5304 typename iterator_traits<_InputIterator1>::value_type,
5305 typename iterator_traits<_InputIterator2>::value_type>)
5306 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5307 typename iterator_traits<_InputIterator2>::value_type,
5308 typename iterator_traits<_InputIterator1>::value_type>)
5309 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5310 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5311 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5312 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5313
5314 return _GLIBCXX_STD_A::__set_difference(__first1, __last1,
5315 __first2, __last2, __result,
5316 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5317 }
5318
5319 template<typename _InputIterator1, typename _InputIterator2,
5320 typename _OutputIterator,
5321 typename _Compare>
5322 _OutputIterator
5323 __set_symmetric_difference(_InputIterator1 __first1,
5324 _InputIterator1 __last1,
5325 _InputIterator2 __first2,
5326 _InputIterator2 __last2,
5327 _OutputIterator __result,
5328 _Compare __comp)
5329 {
5330 while (__first1 != __last1 && __first2 != __last2)
5331 if (__comp(__first1, __first2))
5332 {
5333 *__result = *__first1;
5334 ++__first1;
5335 ++__result;
5336 }
5337 else if (__comp(__first2, __first1))
5338 {
5339 *__result = *__first2;
5340 ++__first2;
5341 ++__result;
5342 }
5343 else
5344 {
5345 ++__first1;
5346 ++__first2;
5347 }
5348 return std::copy(__first2, __last2,
5349 std::copy(__first1, __last1, __result));
5350 }
5351
5352 /**
5353 * @brief Return the symmetric difference of two sorted ranges.
5354 * @ingroup set_algorithms
5355 * @param __first1 Start of first range.
5356 * @param __last1 End of first range.
5357 * @param __first2 Start of second range.
5358 * @param __last2 End of second range.
5359 * @return End of the output range.
5360 * @ingroup set_algorithms
5361 *
5362 * This operation iterates over both ranges, copying elements present in
5363 * one range but not the other in order to the output range. Iterators
5364 * increment for each range. When the current element of one range is less
5365 * than the other, that element is copied and the iterator advances. If an
5366 * element is contained in both ranges, no elements are copied and both
5367 * ranges advance. The output range may not overlap either input range.
5368 */
5369 template<typename _InputIterator1, typename _InputIterator2,
5370 typename _OutputIterator>
5371 inline _OutputIterator
5372 set_symmetric_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5373 _InputIterator2 __first2, _InputIterator2 __last2,
5374 _OutputIterator __result)
5375 {
5376 // concept requirements
5377 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5378 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5379 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5380 typename iterator_traits<_InputIterator1>::value_type>)
5381 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5382 typename iterator_traits<_InputIterator2>::value_type>)
5383 __glibcxx_function_requires(_LessThanOpConcept<
5384 typename iterator_traits<_InputIterator1>::value_type,
5385 typename iterator_traits<_InputIterator2>::value_type>)
5386 __glibcxx_function_requires(_LessThanOpConcept<
5387 typename iterator_traits<_InputIterator2>::value_type,
5388 typename iterator_traits<_InputIterator1>::value_type>)
5389 __glibcxx_requires_sorted_set(__first1, __last1, __first2);
5390 __glibcxx_requires_sorted_set(__first2, __last2, __first1);
5391 __glibcxx_requires_irreflexive2(__first1, __last1);
5392 __glibcxx_requires_irreflexive2(__first2, __last2);
5393
5394 return _GLIBCXX_STD_A::__set_symmetric_difference(__first1, __last1,
5395 __first2, __last2, __result,
5396 __gnu_cxx::__ops::__iter_less_iter());
5397 }
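And a sketch for set_symmetric_difference under the same assumptions (helper name illustrative):

#include <algorithm>
#include <iterator>
#include <vector>

void symmetric_difference_demo() {
  std::vector<int> a{1, 2, 3, 5};
  std::vector<int> b{2, 3, 4, 5};
  std::vector<int> out;
  // Keeps the values present in exactly one of the two ranges; out becomes {1, 4}.
  std::set_symmetric_difference(a.begin(), a.end(), b.begin(), b.end(),
                                std::back_inserter(out));
}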
5398
5399 /**
5400 * @brief Return the symmetric difference of two sorted ranges using
5401 * comparison functor.
5402 * @ingroup set_algorithms
5403 * @param __first1 Start of first range.
5404 * @param __last1 End of first range.
5405 * @param __first2 Start of second range.
5406 * @param __last2 End of second range.
5407 * @param __comp The comparison functor.
5408 * @return End of the output range.
5409 * @ingroup set_algorithms
5410 *
5411 * This operation iterates over both ranges, copying elements present in
5412 * one range but not the other in order to the output range. Iterators
5413 * increment for each range. When the current element of one range is less
5414 * than the other according to @p __comp, that element is copied and the
5415 * iterator advances. If an element is contained in both ranges according
5416 * to @p __comp, no elements are copied and both ranges advance. The output
5417 * range may not overlap either input range.
5418 */
5419 template<typename _InputIterator1, typename _InputIterator2,
5420 typename _OutputIterator, typename _Compare>
5421 inline _OutputIterator
5422 set_symmetric_difference(_InputIterator1 __first1, _InputIterator1 __last1,
5423 _InputIterator2 __first2, _InputIterator2 __last2,
5424 _OutputIterator __result,
5425 _Compare __comp)
5426 {
5427 // concept requirements
5428 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator1>)
5429 __glibcxx_function_requires(_InputIteratorConcept<_InputIterator2>)
5430 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5431 typename iterator_traits<_InputIterator1>::value_type>)
5432 __glibcxx_function_requires(_OutputIteratorConcept<_OutputIterator,
5433 typename iterator_traits<_InputIterator2>::value_type>)
5434 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5435 typename iterator_traits<_InputIterator1>::value_type,
5436 typename iterator_traits<_InputIterator2>::value_type>)
5437 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5438 typename iterator_traits<_InputIterator2>::value_type,
5439 typename iterator_traits<_InputIterator1>::value_type>)
5440 __glibcxx_requires_sorted_set_pred(__first1, __last1, __first2, __comp);
5441 __glibcxx_requires_sorted_set_pred(__first2, __last2, __first1, __comp);
5442 __glibcxx_requires_irreflexive_pred2(__first1, __last1, __comp);
5443 __glibcxx_requires_irreflexive_pred2(__first2, __last2, __comp);
5444
5445 return _GLIBCXX_STD_A::__set_symmetric_difference(__first1, __last1,
5446 __first2, __last2, __result,
5447 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5448 }
5449
5450 template<typename _ForwardIterator, typename _Compare>
5451 _GLIBCXX14_CONSTEXPR
5452 _ForwardIterator
5453 __min_element(_ForwardIterator __first, _ForwardIterator __last,
5454 _Compare __comp)
5455 {
5456 if (__first == __last)
5457 return __first;
5458 _ForwardIterator __result = __first;
5459 while (++__first != __last)
5460 if (__comp(__first, __result))
5461 __result = __first;
5462 return __result;
5463 }
5464
5465 /**
5466 * @brief Return the minimum element in a range.
5467 * @ingroup sorting_algorithms
5468 * @param __first Start of range.
5469 * @param __last End of range.
5470 * @return Iterator referencing the first instance of the smallest value.
5471 */
5472 template<typename _ForwardIterator>
5473 _GLIBCXX14_CONSTEXPR
5474 _ForwardIterator
5475 inline min_element(_ForwardIterator __first, _ForwardIterator __last)
5476 {
5477 // concept requirements
5478 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5479 __glibcxx_function_requires(_LessThanComparableConcept<
5480 typename iterator_traits<_ForwardIterator>::value_type>)
5481 __glibcxx_requires_valid_range(__first, __last);
5482 __glibcxx_requires_irreflexive(__first, __last);
5483
5484 return _GLIBCXX_STD_A::__min_element(__first, __last,
5485 __gnu_cxx::__ops::__iter_less_iter());
5486 }
5487
5488 /**
5489 * @brief Return the minimum element in a range using comparison functor.
5490 * @ingroup sorting_algorithms
5491 * @param __first Start of range.
5492 * @param __last End of range.
5493 * @param __comp Comparison functor.
5494 * @return Iterator referencing the first instance of the smallest value
5495 * according to __comp.
5496 */
5497 template<typename _ForwardIterator, typename _Compare>
5498 _GLIBCXX14_CONSTEXPR
5499 inline _ForwardIterator
5500 min_element(_ForwardIterator __first, _ForwardIterator __last,
5501 _Compare __comp)
5502 {
5503 // concept requirements
5504 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5505 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5506 typename iterator_traits<_ForwardIterator>::value_type,
5507 typename iterator_traits<_ForwardIterator>::value_type>)
5508 __glibcxx_requires_valid_range(__first, __last);
5509 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
5510
5511 return _GLIBCXX_STD_A::__min_element(__first, __last,
5512 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5513 }
5514
5515 template<typename _ForwardIterator, typename _Compare>
5516 _GLIBCXX14_CONSTEXPR
5517 _ForwardIterator
5518 __max_element(_ForwardIterator __first, _ForwardIterator __last,
5519 _Compare __comp)
5520 {
5521 if (__first == __last) return __first;
5522 _ForwardIterator __result = __first;
5523 while (++__first != __last)
5524 if (__comp(__result, __first))
5525 __result = __first;
5526 return __result;
5527 }
5528
5529 /**
5530 * @brief Return the maximum element in a range.
5531 * @ingroup sorting_algorithms
5532 * @param __first Start of range.
5533 * @param __last End of range.
5534 * @return Iterator referencing the first instance of the largest value.
5535 */
5536 template<typename _ForwardIterator>
5537 _GLIBCXX14_CONSTEXPR
5538 inline _ForwardIterator
5539 max_element(_ForwardIterator __first, _ForwardIterator __last)
5540 {
5541 // concept requirements
5542 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5543 __glibcxx_function_requires(_LessThanComparableConcept<
5544 typename iterator_traits<_ForwardIterator>::value_type>)
5545 __glibcxx_requires_valid_range(__first, __last);
5546 __glibcxx_requires_irreflexive(__first, __last);
5547
5548 return _GLIBCXX_STD_A::__max_element(__first, __last,
5549 __gnu_cxx::__ops::__iter_less_iter());
5550 }
5551
5552 /**
5553 * @brief Return the maximum element in a range using comparison functor.
5554 * @ingroup sorting_algorithms
5555 * @param __first Start of range.
5556 * @param __last End of range.
5557 * @param __comp Comparison functor.
5558 * @return Iterator referencing the first instance of the largest value
5559 * according to __comp.
5560 */
5561 template<typename _ForwardIterator, typename _Compare>
5562 _GLIBCXX14_CONSTEXPR
5563 inline _ForwardIterator
5564 max_element(_ForwardIterator __first, _ForwardIterator __last,
5565 _Compare __comp)
5566 {
5567 // concept requirements
5568 __glibcxx_function_requires(_ForwardIteratorConcept<_ForwardIterator>)
5569 __glibcxx_function_requires(_BinaryPredicateConcept<_Compare,
5570 typename iterator_traits<_ForwardIterator>::value_type,
5571 typename iterator_traits<_ForwardIterator>::value_type>)
5572 __glibcxx_requires_valid_range(__first, __last);
5573 __glibcxx_requires_irreflexive_pred(__first, __last, __comp);
5574
5575 return _GLIBCXX_STD_A::__max_element(__first, __last,
5576 __gnu_cxx::__ops::__iter_comp_iter(__comp));
5577 }
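A short sketch covering the min_element and max_element overloads above (names illustrative):

#include <algorithm>
#include <vector>

void min_max_demo() {
  std::vector<int> v{4, 1, 7, 1, 9};
  auto mn = std::min_element(v.begin(), v.end());                // points at the first 1
  auto mx = std::max_element(v.begin(), v.end(),
                             [](int x, int y) { return x < y; }); // points at 9
  (void)mn; (void)mx;
}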
5578
5579_GLIBCXX_END_NAMESPACE_ALGO
5580} // namespace std
5581
5582#endif /* _STL_ALGO_H */

/build/llvm-toolchain-snapshot-11~++20200309111110+2c36c23f347/llvm/include/llvm/ADT/SmallVector.h

1//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the SmallVector class.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_ADT_SMALLVECTOR_H
14#define LLVM_ADT_SMALLVECTOR_H
15
16#include "llvm/ADT/iterator_range.h"
17#include "llvm/Support/AlignOf.h"
18#include "llvm/Support/Compiler.h"
19#include "llvm/Support/MathExtras.h"
20#include "llvm/Support/MemAlloc.h"
21#include "llvm/Support/type_traits.h"
22#include "llvm/Support/ErrorHandling.h"
23#include <algorithm>
24#include <cassert>
25#include <cstddef>
26#include <cstdlib>
27#include <cstring>
28#include <initializer_list>
29#include <iterator>
30#include <memory>
31#include <new>
32#include <type_traits>
33#include <utility>
34
35namespace llvm {
36
37/// This is all the non-templated stuff common to all SmallVectors.
38class SmallVectorBase {
39protected:
40 void *BeginX;
41 unsigned Size = 0, Capacity;
42
43 SmallVectorBase() = delete;
44 SmallVectorBase(void *FirstEl, size_t TotalCapacity)
45 : BeginX(FirstEl), Capacity(TotalCapacity) {}
46
47 /// This is an implementation of the grow() method which only works
48 /// on POD-like data types and is out of line to reduce code duplication.
49 void grow_pod(void *FirstEl, size_t MinCapacity, size_t TSize);
50
51public:
52 size_t size() const { return Size; }
56. Returning zero
53 size_t capacity() const { return Capacity; }
54
55 LLVM_NODISCARD bool empty() const { return !Size; }
56
57 /// Set the array size to \p N, which the current array must have enough
58 /// capacity for.
59 ///
60 /// This does not construct or destroy any elements in the vector.
61 ///
62 /// Clients can use this in conjunction with capacity() to write past the end
63 /// of the buffer when they know that more elements are available, and only
64 /// update the size later. This avoids the cost of value initializing elements
65 /// which will only be overwritten.
66 void set_size(size_t N) {
67 assert(N <= capacity());
68 Size = N;
69 }
70};
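A sketch of the write-past-the-end pattern that the set_size() comment describes, assuming a SmallVectorImpl<int> of trivially copyable elements (the helper below is hypothetical, not LLVM code):

#include "llvm/ADT/SmallVector.h"
#include <cstddef>

void fill_without_value_init(llvm::SmallVectorImpl<int> &V, std::size_t N) {
  V.clear();
  V.reserve(N);                  // guarantee capacity() >= N
  for (std::size_t i = 0; i != N; ++i)
    V.data()[i] = int(i);        // write directly into the buffer, past end()
  V.set_size(N);                 // publish the size afterwards; no value-initialization happened
}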
71
72/// Figure out the offset of the first element.
73template <class T, typename = void> struct SmallVectorAlignmentAndSize {
74 AlignedCharArrayUnion<SmallVectorBase> Base;
75 AlignedCharArrayUnion<T> FirstEl;
76};
77
78/// This is the part of SmallVectorTemplateBase which does not depend on whether
79/// the type T is a POD. The extra dummy template argument is used by ArrayRef
80/// to avoid unnecessarily requiring T to be complete.
81template <typename T, typename = void>
82class SmallVectorTemplateCommon : public SmallVectorBase {
83 /// Find the address of the first element. For this pointer math to be valid
84 /// with small-size of 0 for T with lots of alignment, it's important that
85 /// SmallVectorStorage is properly-aligned even for small-size of 0.
86 void *getFirstEl() const {
87 return const_cast<void *>(reinterpret_cast<const void *>(
88 reinterpret_cast<const char *>(this) +
89 offsetof(SmallVectorAlignmentAndSize<T>, FirstEl)));
90 }
91 // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
92
93protected:
94 SmallVectorTemplateCommon(size_t Size)
95 : SmallVectorBase(getFirstEl(), Size) {}
96
97 void grow_pod(size_t MinCapacity, size_t TSize) {
98 SmallVectorBase::grow_pod(getFirstEl(), MinCapacity, TSize);
99 }
100
101 /// Return true if this is a smallvector which has not had dynamic
102 /// memory allocated for it.
103 bool isSmall() const { return BeginX == getFirstEl(); }
104
105 /// Put this vector in a state of being small.
106 void resetToSmall() {
107 BeginX = getFirstEl();
108 Size = Capacity = 0; // FIXME: Setting Capacity to 0 is suspect.
109 }
110
111public:
112 using size_type = size_t;
113 using difference_type = ptrdiff_t;
114 using value_type = T;
115 using iterator = T *;
116 using const_iterator = const T *;
117
118 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
119 using reverse_iterator = std::reverse_iterator<iterator>;
120
121 using reference = T &;
122 using const_reference = const T &;
123 using pointer = T *;
124 using const_pointer = const T *;
125
126 // forward iterator creation methods.
127 iterator begin() { return (iterator)this->BeginX; }
128 const_iterator begin() const { return (const_iterator)this->BeginX; }
129 iterator end() { return begin() + size(); }
130 const_iterator end() const { return begin() + size(); }
131
132 // reverse iterator creation methods.
133 reverse_iterator rbegin() { return reverse_iterator(end()); }
134 const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
135 reverse_iterator rend() { return reverse_iterator(begin()); }
136 const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
137
138 size_type size_in_bytes() const { return size() * sizeof(T); }
139 size_type max_size() const { return size_type(-1) / sizeof(T); }
140
141 size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
142
143 /// Return a pointer to the vector's buffer, even if empty().
144 pointer data() { return pointer(begin()); }
145 /// Return a pointer to the vector's buffer, even if empty().
146 const_pointer data() const { return const_pointer(begin()); }
147
148 reference operator[](size_type idx) {
149 assert(idx < size());
150 return begin()[idx];
151 }
152 const_reference operator[](size_type idx) const {
153 assert(idx < size());
154 return begin()[idx];
155 }
156
157 reference front() {
158 assert(!empty());
159 return begin()[0];
160 }
161 const_reference front() const {
162 assert(!empty());
163 return begin()[0];
164 }
165
166 reference back() {
167 assert(!empty());
168 return end()[-1];
169 }
170 const_reference back() const {
171 assert(!empty());
172 return end()[-1];
173 }
174};
175
176/// SmallVectorTemplateBase<TriviallyCopyable = false> - This is where we put method
177/// implementations that are designed to work with non-POD-like T's.
178template <typename T, bool = is_trivially_copyable<T>::value>
179class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
180protected:
181 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
182
183 static void destroy_range(T *S, T *E) {
184 while (S != E) {
185 --E;
186 E->~T();
187 }
188 }
189
190 /// Move the range [I, E) into the uninitialized memory starting with "Dest",
191 /// constructing elements as needed.
192 template<typename It1, typename It2>
193 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
194 std::uninitialized_copy(std::make_move_iterator(I),
195 std::make_move_iterator(E), Dest);
196 }
197
198 /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
199 /// constructing elements as needed.
200 template<typename It1, typename It2>
201 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
202 std::uninitialized_copy(I, E, Dest);
203 }
204
205 /// Grow the allocated memory (without initializing new elements), doubling
206 /// the size of the allocated memory. Guarantees space for at least one more
207 /// element, or MinSize more elements if specified.
208 void grow(size_t MinSize = 0);
209
210public:
211 void push_back(const T &Elt) {
212 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
213 this->grow();
214 ::new ((void*) this->end()) T(Elt);
215 this->set_size(this->size() + 1);
216 }
217
218 void push_back(T &&Elt) {
219 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
220 this->grow();
221 ::new ((void*) this->end()) T(::std::move(Elt));
222 this->set_size(this->size() + 1);
223 }
224
225 void pop_back() {
226 this->set_size(this->size() - 1);
227 this->end()->~T();
228 }
229};
230
231// Define this out-of-line to dissuade the C++ compiler from inlining it.
232template <typename T, bool TriviallyCopyable>
233void SmallVectorTemplateBase<T, TriviallyCopyable>::grow(size_t MinSize) {
234 if (MinSize > UINT32_MAX)
235 report_bad_alloc_error("SmallVector capacity overflow during allocation");
236
237 // Always grow, even from zero.
238 size_t NewCapacity = size_t(NextPowerOf2(this->capacity() + 2));
239 NewCapacity = std::min(std::max(NewCapacity, MinSize), size_t(UINT32_MAX));
240 T *NewElts = static_cast<T*>(llvm::safe_malloc(NewCapacity*sizeof(T)));
241
242 // Move the elements over.
243 this->uninitialized_move(this->begin(), this->end(), NewElts);
244
245 // Destroy the original elements.
246 destroy_range(this->begin(), this->end());
247
248 // If this wasn't grown from the inline copy, deallocate the old space.
249 if (!this->isSmall())
250 free(this->begin());
251
252 this->BeginX = NewElts;
253 this->Capacity = NewCapacity;
254}
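A standalone sketch of the capacity computation used by grow() above: round capacity()+2 up to the next power of two, then clamp to [MinSize, UINT32_MAX]. The helpers below are illustrative stand-ins, not the LLVM API, and assume realistic capacities well below SIZE_MAX:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative stand-in for llvm::NextPowerOf2: smallest power of two strictly greater than v.
static std::size_t next_pow2(std::size_t v) {
  std::size_t p = 1;
  while (p <= v)
    p <<= 1;
  return p;
}

// Mirrors the NewCapacity computation in grow(): always grows, even from zero.
static std::size_t new_capacity(std::size_t cur, std::size_t min_size) {
  std::size_t next = next_pow2(cur + 2);
  return std::min(std::max(next, min_size), std::size_t(UINT32_MAX));
}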
255
256/// SmallVectorTemplateBase<TriviallyCopyable = true> - This is where we put
257/// method implementations that are designed to work with POD-like T's.
258template <typename T>
259class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
260protected:
261 SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
262
263 // No need to do a destroy loop for POD's.
264 static void destroy_range(T *, T *) {}
265
266 /// Move the range [I, E) onto the uninitialized memory
267 /// starting with "Dest", constructing elements into it as needed.
268 template<typename It1, typename It2>
269 static void uninitialized_move(It1 I, It1 E, It2 Dest) {
270 // Just do a copy.
271 uninitialized_copy(I, E, Dest);
272 }
273
274 /// Copy the range [I, E) onto the uninitialized memory
275 /// starting with "Dest", constructing elements into it as needed.
276 template<typename It1, typename It2>
277 static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
278 // Arbitrary iterator types; just use the basic implementation.
279 std::uninitialized_copy(I, E, Dest);
280 }
281
282 /// Copy the range [I, E) onto the uninitialized memory
283 /// starting with "Dest", constructing elements into it as needed.
284 template <typename T1, typename T2>
285 static void uninitialized_copy(
286 T1 *I, T1 *E, T2 *Dest,
287 std::enable_if_t<std::is_same<typename std::remove_const<T1>::type,
288 T2>::value> * = nullptr) {
289 // Use memcpy for PODs iterated by pointers (which includes SmallVector
290 // iterators): std::uninitialized_copy optimizes to memmove, but we can
291 // use memcpy here. Note that I and E are iterators and thus might be
292 // invalid for memcpy if they are equal.
293 if (I != E)
294 memcpy(reinterpret_cast<void *>(Dest), I, (E - I) * sizeof(T));
295 }
296
297 /// Double the size of the allocated memory, guaranteeing space for at
298 /// least one more element or MinSize if specified.
299 void grow(size_t MinSize = 0) { this->grow_pod(MinSize, sizeof(T)); }
300
301public:
302 void push_back(const T &Elt) {
303 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
304 this->grow();
305 memcpy(reinterpret_cast<void *>(this->end()), &Elt, sizeof(T));
306 this->set_size(this->size() + 1);
307 }
308
309 void pop_back() { this->set_size(this->size() - 1); }
310};
311
312/// This class consists of common code factored out of the SmallVector class to
313/// reduce code duplication based on the SmallVector 'N' template parameter.
314template <typename T>
315class SmallVectorImpl : public SmallVectorTemplateBase<T> {
316 using SuperClass = SmallVectorTemplateBase<T>;
317
318public:
319 using iterator = typename SuperClass::iterator;
320 using const_iterator = typename SuperClass::const_iterator;
321 using reference = typename SuperClass::reference;
322 using size_type = typename SuperClass::size_type;
323
324protected:
325 // Default ctor - Initialize to empty.
326 explicit SmallVectorImpl(unsigned N)
327 : SmallVectorTemplateBase<T>(N) {}
328
329public:
330 SmallVectorImpl(const SmallVectorImpl &) = delete;
331
332 ~SmallVectorImpl() {
333 // Subclass has already destructed this vector's elements.
334 // If this wasn't grown from the inline copy, deallocate the old space.
335 if (!this->isSmall())
336 free(this->begin());
337 }
338
339 void clear() {
340 this->destroy_range(this->begin(), this->end());
341 this->Size = 0;
342 }
343
344 void resize(size_type N) {
345 if (N < this->size()) {
346 this->destroy_range(this->begin()+N, this->end());
347 this->set_size(N);
348 } else if (N > this->size()) {
349 if (this->capacity() < N)
350 this->grow(N);
351 for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
352 new (&*I) T();
353 this->set_size(N);
354 }
355 }
356
357 void resize(size_type N, const T &NV) {
358 if (N < this->size()) {
359 this->destroy_range(this->begin()+N, this->end());
360 this->set_size(N);
361 } else if (N > this->size()) {
362 if (this->capacity() < N)
363 this->grow(N);
364 std::uninitialized_fill(this->end(), this->begin()+N, NV);
365 this->set_size(N);
366 }
367 }
368
369 void reserve(size_type N) {
370 if (this->capacity() < N)
371 this->grow(N);
372 }
373
374 LLVM_NODISCARD T pop_back_val() {
375 T Result = ::std::move(this->back());
376 this->pop_back();
377 return Result;
378 }
379
380 void swap(SmallVectorImpl &RHS);
381
382 /// Add the specified range to the end of the SmallVector.
383 template <typename in_iter,
384 typename = std::enable_if_t<std::is_convertible<
385 typename std::iterator_traits<in_iter>::iterator_category,
386 std::input_iterator_tag>::value>>
387 void append(in_iter in_start, in_iter in_end) {
388 size_type NumInputs = std::distance(in_start, in_end);
389 if (NumInputs > this->capacity() - this->size())
390 this->grow(this->size()+NumInputs);
391
392 this->uninitialized_copy(in_start, in_end, this->end());
393 this->set_size(this->size() + NumInputs);
394 }
395
396 /// Append \p NumInputs copies of \p Elt to the end.
397 void append(size_type NumInputs, const T &Elt) {
398 if (NumInputs > this->capacity() - this->size())
399 this->grow(this->size()+NumInputs);
400
401 std::uninitialized_fill_n(this->end(), NumInputs, Elt);
402 this->set_size(this->size() + NumInputs);
403 }
404
405 void append(std::initializer_list<T> IL) {
406 append(IL.begin(), IL.end());
407 }
408
409 // FIXME: Consider assigning over existing elements, rather than clearing &
410 // re-initializing them - for all assign(...) variants.
411
412 void assign(size_type NumElts, const T &Elt) {
413 clear();
414 if (this->capacity() < NumElts)
415 this->grow(NumElts);
416 this->set_size(NumElts);
417 std::uninitialized_fill(this->begin(), this->end(), Elt);
418 }
419
420 template <typename in_iter,
421 typename = std::enable_if_t<std::is_convertible<
422 typename std::iterator_traits<in_iter>::iterator_category,
423 std::input_iterator_tag>::value>>
424 void assign(in_iter in_start, in_iter in_end) {
425 clear();
426 append(in_start, in_end);
427 }
428
429 void assign(std::initializer_list<T> IL) {
430 clear();
431 append(IL);
432 }
433
434 iterator erase(const_iterator CI) {
435 // Just cast away constness because this is a non-const member function.
436 iterator I = const_cast<iterator>(CI);
437
438 assert(I >= this->begin() && "Iterator to erase is out of bounds.");
439 assert(I < this->end() && "Erasing at past-the-end iterator.");
440
441 iterator N = I;
442 // Shift all elts down one.
443 std::move(I+1, this->end(), I);
444 // Drop the last elt.
445 this->pop_back();
446 return(N);
447 }
448
449 iterator erase(const_iterator CS, const_iterator CE) {
450 // Just cast away constness because this is a non-const member function.
451 iterator S = const_cast<iterator>(CS);
452 iterator E = const_cast<iterator>(CE);
453
454 assert(S >= this->begin() && "Range to erase is out of bounds.");
455 assert(S <= E && "Trying to erase invalid range.");
456 assert(E <= this->end() && "Trying to erase past the end.");
457
458 iterator N = S;
459 // Shift all elts down.
460 iterator I = std::move(E, this->end(), S);
461 // Drop the last elts.
462 this->destroy_range(I, this->end());
463 this->set_size(I - this->begin());
464 return(N);
465 }
466
467 iterator insert(iterator I, T &&Elt) {
468 if (I == this->end()) { // Important special case for empty vector.
469 this->push_back(::std::move(Elt));
470 return this->end()-1;
471 }
472
473 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
474 assert(I <= this->end() && "Inserting past the end of the vector.");
475
476 if (this->size() >= this->capacity()) {
477 size_t EltNo = I-this->begin();
478 this->grow();
479 I = this->begin()+EltNo;
480 }
481
482 ::new ((void*) this->end()) T(::std::move(this->back()));
483 // Push everything else over.
484 std::move_backward(I, this->end()-1, this->end());
485 this->set_size(this->size() + 1);
486
487 // If we just moved the element we're inserting, be sure to update
488 // the reference.
489 T *EltPtr = &Elt;
490 if (I <= EltPtr && EltPtr < this->end())
491 ++EltPtr;
492
493 *I = ::std::move(*EltPtr);
494 return I;
495 }
496
497 iterator insert(iterator I, const T &Elt) {
498 if (I == this->end()) { // Important special case for empty vector.
499 this->push_back(Elt);
500 return this->end()-1;
501 }
502
503 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
504 assert(I <= this->end() && "Inserting past the end of the vector.");
505
506 if (this->size() >= this->capacity()) {
507 size_t EltNo = I-this->begin();
508 this->grow();
509 I = this->begin()+EltNo;
510 }
511 ::new ((void*) this->end()) T(std::move(this->back()));
512 // Push everything else over.
513 std::move_backward(I, this->end()-1, this->end());
514 this->set_size(this->size() + 1);
515
516 // If we just moved the element we're inserting, be sure to update
517 // the reference.
518 const T *EltPtr = &Elt;
519 if (I <= EltPtr && EltPtr < this->end())
520 ++EltPtr;
521
522 *I = *EltPtr;
523 return I;
524 }
525
526 iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
527 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
528 size_t InsertElt = I - this->begin();
529
530 if (I == this->end()) { // Important special case for empty vector.
531 append(NumToInsert, Elt);
532 return this->begin()+InsertElt;
533 }
534
535 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
536 assert(I <= this->end() && "Inserting past the end of the vector.");
537
538 // Ensure there is enough space.
539 reserve(this->size() + NumToInsert);
540
541 // Uninvalidate the iterator.
542 I = this->begin()+InsertElt;
543
544 // If there are more elements between the insertion point and the end of the
545 // range than there are being inserted, we can use a simple approach to
546 // insertion. Since we already reserved space, we know that this won't
547 // reallocate the vector.
548 if (size_t(this->end()-I) >= NumToInsert) {
549 T *OldEnd = this->end();
550 append(std::move_iterator<iterator>(this->end() - NumToInsert),
551 std::move_iterator<iterator>(this->end()));
552
553 // Copy the existing elements that get replaced.
554 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
555
556 std::fill_n(I, NumToInsert, Elt);
557 return I;
558 }
559
560 // Otherwise, we're inserting more elements than exist already, and we're
561 // not inserting at the end.
562
563 // Move over the elements that we're about to overwrite.
564 T *OldEnd = this->end();
565 this->set_size(this->size() + NumToInsert);
566 size_t NumOverwritten = OldEnd-I;
567 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
568
569 // Replace the overwritten part.
570 std::fill_n(I, NumOverwritten, Elt);
571
572 // Insert the non-overwritten middle part.
573 std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt);
574 return I;
575 }
576
577 template <typename ItTy,
578 typename = std::enable_if_t<std::is_convertible<
579 typename std::iterator_traits<ItTy>::iterator_category,
580 std::input_iterator_tag>::value>>
581 iterator insert(iterator I, ItTy From, ItTy To) {
582 // Convert iterator to elt# to avoid invalidating iterator when we reserve()
583 size_t InsertElt = I - this->begin();
584
585 if (I == this->end()) { // Important special case for empty vector.
586 append(From, To);
587 return this->begin()+InsertElt;
588 }
589
590 assert(I >= this->begin() && "Insertion iterator is out of bounds.");
591 assert(I <= this->end() && "Inserting past the end of the vector.");
592
593 size_t NumToInsert = std::distance(From, To);
594
595 // Ensure there is enough space.
596 reserve(this->size() + NumToInsert);
597
598 // Uninvalidate the iterator.
599 I = this->begin()+InsertElt;
600
601 // If there are more elements between the insertion point and the end of the
602 // range than there are being inserted, we can use a simple approach to
603 // insertion. Since we already reserved space, we know that this won't
604 // reallocate the vector.
605 if (size_t(this->end()-I) >= NumToInsert) {
606 T *OldEnd = this->end();
607 append(std::move_iterator<iterator>(this->end() - NumToInsert),
608 std::move_iterator<iterator>(this->end()));
609
610 // Copy the existing elements that get replaced.
611 std::move_backward(I, OldEnd-NumToInsert, OldEnd);
612
613 std::copy(From, To, I);
614 return I;
615 }
616
617 // Otherwise, we're inserting more elements than exist already, and we're
618 // not inserting at the end.
619
620 // Move over the elements that we're about to overwrite.
621 T *OldEnd = this->end();
622 this->set_size(this->size() + NumToInsert);
623 size_t NumOverwritten = OldEnd-I;
624 this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
625
626 // Replace the overwritten part.
627 for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
628 *J = *From;
629 ++J; ++From;
630 }
631
632 // Insert the non-overwritten middle part.
633 this->uninitialized_copy(From, To, OldEnd);
634 return I;
635 }
636
637 void insert(iterator I, std::initializer_list<T> IL) {
638 insert(I, IL.begin(), IL.end());
639 }
640
641 template <typename... ArgTypes> reference emplace_back(ArgTypes &&... Args) {
642 if (LLVM_UNLIKELY(this->size() >= this->capacity()))
643 this->grow();
644 ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
645 this->set_size(this->size() + 1);
646 return this->back();
647 }
648
649 SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
650
651 SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
652
653 bool operator==(const SmallVectorImpl &RHS) const {
654 if (this->size() != RHS.size()) return false;
655 return std::equal(this->begin(), this->end(), RHS.begin());
656 }
657 bool operator!=(const SmallVectorImpl &RHS) const {
658 return !(*this == RHS);
659 }
660
661 bool operator<(const SmallVectorImpl &RHS) const {
662 return std::lexicographical_compare(this->begin(), this->end(),
663 RHS.begin(), RHS.end());
664 }
665};
666
667template <typename T>
668void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
669 if (this == &RHS) return;
670
671 // We can only avoid copying elements if neither vector is small.
672 if (!this->isSmall() && !RHS.isSmall()) {
673 std::swap(this->BeginX, RHS.BeginX);
674 std::swap(this->Size, RHS.Size);
675 std::swap(this->Capacity, RHS.Capacity);
676 return;
677 }
678 if (RHS.size() > this->capacity())
679 this->grow(RHS.size());
680 if (this->size() > RHS.capacity())
681 RHS.grow(this->size());
682
683 // Swap the shared elements.
684 size_t NumShared = this->size();
685 if (NumShared > RHS.size()) NumShared = RHS.size();
686 for (size_type i = 0; i != NumShared; ++i)
687 std::swap((*this)[i], RHS[i]);
688
689 // Copy over the extra elts.
690 if (this->size() > RHS.size()) {
691 size_t EltDiff = this->size() - RHS.size();
692 this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
693 RHS.set_size(RHS.size() + EltDiff);
694 this->destroy_range(this->begin()+NumShared, this->end());
695 this->set_size(NumShared);
696 } else if (RHS.size() > this->size()) {
697 size_t EltDiff = RHS.size() - this->size();
698 this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
699 this->set_size(this->size() + EltDiff);
700 this->destroy_range(RHS.begin()+NumShared, RHS.end());
701 RHS.set_size(NumShared);
702 }
703}
704
705template <typename T>
706SmallVectorImpl<T> &SmallVectorImpl<T>::
707 operator=(const SmallVectorImpl<T> &RHS) {
708 // Avoid self-assignment.
709 if (this == &RHS) return *this;
710
711 // If we already have sufficient space, assign the common elements, then
712 // destroy any excess.
713 size_t RHSSize = RHS.size();
714 size_t CurSize = this->size();
715 if (CurSize >= RHSSize) {
716 // Assign common elements.
717 iterator NewEnd;
718 if (RHSSize)
719 NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
720 else
721 NewEnd = this->begin();
722
723 // Destroy excess elements.
724 this->destroy_range(NewEnd, this->end());
725
726 // Trim.
727 this->set_size(RHSSize);
728 return *this;
729 }
730
731 // If we have to grow to have enough elements, destroy the current elements.
732 // This allows us to avoid copying them during the grow.
733 // FIXME: don't do this if they're efficiently moveable.
734 if (this->capacity() < RHSSize) {
735 // Destroy current elements.
736 this->destroy_range(this->begin(), this->end());
737 this->set_size(0);
738 CurSize = 0;
739 this->grow(RHSSize);
740 } else if (CurSize) {
741 // Otherwise, use assignment for the already-constructed elements.
742 std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
743 }
744
745 // Copy construct the new elements in place.
746 this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
747 this->begin()+CurSize);
748
749 // Set end.
750 this->set_size(RHSSize);
751 return *this;
752}
753
754template <typename T>
755SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
756 // Avoid self-assignment.
757 if (this == &RHS) return *this;
758
759 // If the RHS isn't small, clear this vector and then steal its buffer.
760 if (!RHS.isSmall()) {
761 this->destroy_range(this->begin(), this->end());
762 if (!this->isSmall()) free(this->begin());
763 this->BeginX = RHS.BeginX;
764 this->Size = RHS.Size;
765 this->Capacity = RHS.Capacity;
766 RHS.resetToSmall();
767 return *this;
768 }
769
770 // If we already have sufficient space, assign the common elements, then
771 // destroy any excess.
772 size_t RHSSize = RHS.size();
773 size_t CurSize = this->size();
774 if (CurSize >= RHSSize) {
775 // Assign common elements.
776 iterator NewEnd = this->begin();
777 if (RHSSize)
778 NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
779
780 // Destroy excess elements and trim the bounds.
781 this->destroy_range(NewEnd, this->end());
782 this->set_size(RHSSize);
783
784 // Clear the RHS.
785 RHS.clear();
786
787 return *this;
788 }
789
790 // If we have to grow to have enough elements, destroy the current elements.
791 // This allows us to avoid copying them during the grow.
792 // FIXME: this may not actually make any sense if we can efficiently move
793 // elements.
794 if (this->capacity() < RHSSize) {
795 // Destroy current elements.
796 this->destroy_range(this->begin(), this->end());
797 this->set_size(0);
798 CurSize = 0;
799 this->grow(RHSSize);
800 } else if (CurSize) {
801 // Otherwise, use assignment for the already-constructed elements.
802 std::move(RHS.begin(), RHS.begin()+CurSize, this->begin());
803 }
804
805 // Move-construct the new elements in place.
806 this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
807 this->begin()+CurSize);
808
809 // Set end.
810 this->set_size(RHSSize);
811
812 RHS.clear();
813 return *this;
814}
815
816/// Storage for the SmallVector elements. This is specialized for the N=0 case
817/// to avoid allocating unnecessary storage.
818template <typename T, unsigned N>
819struct SmallVectorStorage {
820 AlignedCharArrayUnion<T> InlineElts[N];
821};
822
823/// We need the storage to be properly aligned even for small-size of 0 so that
824/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is
825/// well-defined.
826template <typename T> struct alignas(alignof(T)) SmallVectorStorage<T, 0> {};
827
828/// This is a 'vector' (really, a variable-sized array), optimized
829/// for the case when the array is small. It contains some number of elements
830/// in-place, which allows it to avoid heap allocation when the actual number of
831/// elements is below that threshold. This allows normal "small" cases to be
832/// fast without losing generality for large inputs.
833///
834/// Note that this does not attempt to be exception safe.
835///
836template <typename T, unsigned N>
837class SmallVector : public SmallVectorImpl<T>, SmallVectorStorage<T, N> {
838public:
839 SmallVector() : SmallVectorImpl<T>(N) {}
840
841 ~SmallVector() {
842 // Destroy the constructed elements in the vector.
843 this->destroy_range(this->begin(), this->end());
844 }
845
846 explicit SmallVector(size_t Size, const T &Value = T())
847 : SmallVectorImpl<T>(N) {
848 this->assign(Size, Value);
849 }
850
851 template <typename ItTy,
852 typename = std::enable_if_t<std::is_convertible<
853 typename std::iterator_traits<ItTy>::iterator_category,
854 std::input_iterator_tag>::value>>
855 SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
856 this->append(S, E);
857 }
858
859 template <typename RangeTy>
860 explicit SmallVector(const iterator_range<RangeTy> &R)
861 : SmallVectorImpl<T>(N) {
862 this->append(R.begin(), R.end());
863 }
864
865 SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
866 this->assign(IL);
867 }
868
869 SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
870 if (!RHS.empty())
871 SmallVectorImpl<T>::operator=(RHS);
872 }
873
874 const SmallVector &operator=(const SmallVector &RHS) {
875 SmallVectorImpl<T>::operator=(RHS);
876 return *this;
877 }
878
879 SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
880 if (!RHS.empty())
881 SmallVectorImpl<T>::operator=(::std::move(RHS));
882 }
883
884 SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
885 if (!RHS.empty())
886 SmallVectorImpl<T>::operator=(::std::move(RHS));
887 }
888
889 const SmallVector &operator=(SmallVector &&RHS) {
890 SmallVectorImpl<T>::operator=(::std::move(RHS));
891 return *this;
892 }
893
894 const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
895 SmallVectorImpl<T>::operator=(::std::move(RHS));
896 return *this;
897 }
898
899 const SmallVector &operator=(std::initializer_list<T> IL) {
900 this->assign(IL);
901 return *this;
902 }
903};
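A minimal usage sketch of SmallVector itself (not from the header; helper name illustrative), assuming the LLVM headers are on the include path:

#include "llvm/ADT/SmallVector.h"

void smallvector_demo() {
  llvm::SmallVector<int, 4> V;   // space for 4 elements in-place, no heap allocation yet
  for (int i = 0; i < 4; ++i)
    V.push_back(i);              // still within the inline storage
  V.push_back(4);                // exceeds N == 4, so grow() moves the elements to the heap
  int last = V.pop_back_val();   // last == 4
  (void)last;
}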
904
905template <typename T, unsigned N>
906inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
907 return X.capacity_in_bytes();
908}
909
910/// Given a range of type R, iterate the entire range and return a
911/// SmallVector with elements of the vector. This is useful, for example,
912/// when you want to iterate a range and then sort the results.
913template <unsigned Size, typename R>
914SmallVector<typename std::remove_const<typename std::remove_reference<
915 decltype(*std::begin(std::declval<R &>()))>::type>::type,
916 Size>
917to_vector(R &&Range) {
918 return {std::begin(Range), std::end(Range)};
919}
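A sketch of to_vector (helper name illustrative), copying an arbitrary range into a SmallVector so it can be sorted afterwards, as the comment above suggests:

#include "llvm/ADT/SmallVector.h"
#include <algorithm>

void to_vector_demo() {
  int raw[] = {3, 1, 2};
  auto V = llvm::to_vector<8>(raw);   // SmallVector<int, 8> holding {3, 1, 2}
  std::sort(V.begin(), V.end());      // V is now {1, 2, 3}
}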
920
921} // end namespace llvm
922
923namespace std {
924
925 /// Implement std::swap in terms of SmallVector swap.
926 template<typename T>
927 inline void
928 swap(llvm::SmallVectorImpl<T> &LHS, llvm::SmallVectorImpl<T> &RHS) {
929 LHS.swap(RHS);
930 }
931
932 /// Implement std::swap in terms of SmallVector swap.
933 template<typename T, unsigned N>
934 inline void
935 swap(llvm::SmallVector<T, N> &LHS, llvm::SmallVector<T, N> &RHS) {
936 LHS.swap(RHS);
937 }
938
939} // end namespace std
940
941#endif // LLVM_ADT_SMALLVECTOR_H